vmd.c revision 1.3
1/*	$OpenBSD: vmd.c,v 1.3 2015/11/22 22:29:48 deraadt Exp $	*/
2
3/*
4 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19/*
20 * vmd(8) - virtual machine daemon
21 */
22
23#include <sys/types.h>
24#include <sys/ioctl.h>
25#include <sys/queue.h>
26#include <sys/uio.h>
27#include <sys/socket.h>
28#include <sys/stat.h>
29#include <sys/un.h>
30#include <sys/wait.h>
31#include <sys/mman.h>
32#include <sys/time.h>
33
34#include <dev/ic/comreg.h>
35#include <dev/ic/i8253reg.h>
36#include <dev/isa/isareg.h>
37#include <dev/pci/pcireg.h>
38
39#include <machine/param.h>
40#include <machine/vmmvar.h>
41
42#include <err.h>
43#include <errno.h>
44#include <fcntl.h>
45#include <imsg.h>
46#include <limits.h>
47#include <pthread.h>
48#include <pwd.h>
49#include <signal.h>
50#include <stddef.h>
51#include <stdio.h>
52#include <stdlib.h>
53#include <string.h>
54#include <termios.h>
55#include <unistd.h>
56#include <util.h>
57
58#include "vmd.h"
59#include "loadfile.h"
60#include "pci.h"
61#include "virtio.h"
62
/* Backlog passed to listen(2) on the control socket */
#define NR_BACKLOG 5

/* opentap() probes /dev/tap0 through /dev/tap(MAX_TAP - 1) */
#define MAX_TAP 256

/*
 * Emulated 8250 UART
 *
 * I/O port numbers of the COM1 registers handled by the
 * vcpu_process_com_*() functions.
 */
#define COM1_DATA	0x3f8
#define COM1_IER	0x3f9
#define COM1_IIR	0x3fa
#define COM1_LCR	0x3fb
#define COM1_MCR	0x3fc
#define COM1_LSR	0x3fd
#define COM1_MSR	0x3fe
#define COM1_SCR	0x3ff

/*
 * Emulated i8253 PIT (counter)
 *
 * NS_PER_TICK is the (integer) number of nanoseconds per PIT tick; it is
 * used to convert elapsed wall-clock time into a tick count.
 */
#define TIMER_BASE	0x40
#define TIMER_CTRL	0x43	/* i8253 control word port */
#define NS_PER_TICK (1000000000 / TIMER_FREQ)
86
/*
 * i8253 registers
 *
 * State kept for one emulated PIT counter. Ticks are not emulated one by
 * one; the current count is derived from 'tv' and 'start' when the guest
 * latches the counter.
 */
struct i8253_counter {
	struct timeval tv;	/* timer start time */
	uint16_t start;		/* starting value */
	uint16_t olatch;	/* output latch (value returned on reads) */
	uint16_t ilatch;	/* input latch (value assembled by writes) */
	uint8_t last_r;		/* last read byte (MSB/LSB) */
	uint8_t last_w;		/* last written byte (MSB/LSB) */
};
96
/*
 * ns8250 UART registers
 *
 * Register file of the emulated COM1 port. 'data' holds a received
 * character that the guest has not read yet.
 */
struct ns8250_regs {
	uint8_t lcr;		/* Line Control Register */
	uint8_t fcr;		/* FIFO Control Register */
	uint8_t iir;		/* Interrupt ID Register */
	uint8_t ier;		/* Interrupt Enable Register */
	uint8_t divlo;		/* Baud rate divisor low byte */
	uint8_t divhi;		/* Baud rate divisor high byte */
	uint8_t msr;		/* Modem Status Register */
	uint8_t lsr;		/* Line Status Register */
	uint8_t mcr;		/* Modem Control Register */
	uint8_t scr;		/* Scratch Register */
	uint8_t data;		/* Unread input data */
};
111
/* State of the three emulated PIT counters; reset by init_emulated_hw() */
struct i8253_counter i8253_counter[3];
/* State of the emulated COM1 UART; reset by init_emulated_hw() */
struct ns8250_regs com1_regs;
114
/* Daemon start-up and control socket request handling */
void sighdlr(int);
int main(int, char **);
int control_run(void);
int disable_vmm(void);
int enable_vmm(void);
int start_vm(struct imsg *);
int terminate_vm(struct imsg *);
int get_info_vm(struct imsgbuf *);
/* Per-VM child process: creation and vcpu run loop */
int start_client_vmd(void);
int opentap(void);
int run_vm(int *, int *, struct vm_create_params *);
void *vcpu_run_loop(void *);
int vcpu_exit(struct vm_run_params *);
int vmm_create_vm(struct vm_create_params *);
/* Emulated hardware: set-up and in/out exit handlers */
void init_emulated_hw(struct vm_create_params *, int *, int *);
void vcpu_exit_inout(struct vm_run_params *);
uint8_t vcpu_exit_pci(struct vm_run_params *);
void vcpu_exit_i8253(union vm_exit *);
void vcpu_exit_com(struct vm_run_params *);
void vcpu_process_com_data(union vm_exit *);
void vcpu_process_com_lcr(union vm_exit *);
void vcpu_process_com_lsr(union vm_exit *);
void vcpu_process_com_ier(union vm_exit *);
void vcpu_process_com_mcr(union vm_exit *);
void vcpu_process_com_iir(union vm_exit *);
void vcpu_process_com_msr(union vm_exit *);
void vcpu_process_com_scr(union vm_exit *);
142
/*
 * vmm_fd: descriptor of the vmm(4) device node, opened in main()
 * con_fd: VM console descriptor; set to the pty master in the child
 *     created by start_vm()
 * vm_id: NOTE(review): not referenced in the part of the file reviewed
 *     here - confirm it is still needed
 */
int vmm_fd, con_fd, vm_id;
/* Set by sighdlr() on SIGTERM/SIGINT; tested by the control loop */
volatile sig_atomic_t quit;

/* List of struct vmstate entries; initialized empty in main() */
SLIST_HEAD(vmstate_head, vmstate);
struct vmstate_head vmstate;

/* Program name used as prefix in diagnostics; defined outside this file */
extern char *__progname;
150
151/*
152 * sighdlr
153 *
154 * Signal handler for TERM/INT/CHLD signals used during daemon shutdown
155 *
156 * Parameters:
157 *  sig: signal caught
158 */
159void
160sighdlr(int sig)
161{
162	switch (sig) {
163	case SIGTERM:
164	case SIGINT:
165		/* Tell main imsg loop to exit */
166		quit = 1;
167		break;
168	case SIGCHLD:
169		while (waitpid(WAIT_ANY, 0, WNOHANG) > 0) {}
170		break;
171	}
172}
173
174int
175main(int argc, char **argv)
176{
177	int res;
178
179	/* Open /dev/vmm */
180	vmm_fd = open(VMM_NODE, O_RDONLY);
181	if (vmm_fd == -1)
182		errx(1, "can't open vmm device node %s", VMM_NODE);
183
184	setproctitle("control");
185
186	SLIST_INIT(&vmstate);
187
188	signal(SIGTERM, sighdlr);
189	signal(SIGINT, sighdlr);
190	signal(SIGCHLD, sighdlr);
191
192	if (daemon(0, 1) == -1)
193		errx(1, "can't daemonize\n");
194
195	res = control_run();
196
197	if (res == -1)
198		errx(1, "control socket error\n");
199
200	return (0);
201}
202
203/*
204 * control_run
205 *
206 * Main control loop - establishes listening socket for incoming vmmctl(8)
207 * requests and dispatches appropriate calls to vmm(4). Replies to
208 * vmmctl(8) using imsg.
209 *
210 * Return values:
211 *  0: normal exit (signal to quit received)
212 *  -1: abnormal exit (various causes)
213 */
214int
215control_run(void)
216{
217	struct sockaddr_un sun, c_sun;
218	socklen_t len;
219	int fd, connfd, n, res;
220	mode_t mode, old_umask;
221	char *socketpath;
222	struct imsgbuf *ibuf;
223	struct imsg imsg;
224
225	/* Establish and start listening on control socket */
226	socketpath = SOCKET_NAME;
227	if ((fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0)) == -1) {
228		fprintf(stderr, "%s: socket error\n", __progname);
229		return (-1);
230	}
231
232	bzero(&sun, sizeof(sun));
233	sun.sun_family = AF_UNIX;
234	if (strlcpy(sun.sun_path, socketpath, sizeof(sun.sun_path)) >=
235	    sizeof(sun.sun_path)) {
236		fprintf(stderr, "%s: socket name too long\n", __progname);
237		close(fd);
238		return (-1);
239	}
240
241	if (unlink(socketpath) == -1)
242		if (errno != ENOENT) {
243			fprintf(stderr, "%s: unlink of %s failed\n",
244			    __progname, socketpath);
245			close(fd);
246			return (-1);
247		}
248
249	old_umask = umask(S_IXUSR|S_IXGRP|S_IWOTH|S_IROTH|S_IXOTH);
250	mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP;
251
252	if (bind(fd, (struct sockaddr *)&sun, sizeof(sun)) == -1) {
253		fprintf(stderr, "%s: control_init: bind of %s failed\n",
254		    __progname, socketpath);
255		close(fd);
256		umask(old_umask);
257		return (-1);
258	}
259
260	umask(old_umask);
261
262	if (chmod(socketpath, mode) == -1) {
263		fprintf(stderr, "%s: control_init: chmod of %s failed\n",
264		    __progname, socketpath);
265		close(fd);
266		unlink(socketpath);
267		return (-1);
268	}
269
270	if ((ibuf = malloc(sizeof(struct imsgbuf))) == NULL) {
271		fprintf(stderr, "%s: out of memory\n", __progname);
272		close(fd);
273		unlink(socketpath);
274		return (-1);
275	}
276
277	if (listen(fd, NR_BACKLOG) == -1) {
278		fprintf(stderr, "%s: listen failed\n", __progname);
279		close(fd);
280		unlink(socketpath);
281		return (-1);
282	}
283
284	while (!quit) {
285		if ((connfd = accept4(fd, (struct sockaddr *)&c_sun, &len,
286		    SOCK_CLOEXEC)) == -1) {
287			fprintf(stderr, "%s: accept4 error\n", __progname);
288			close(fd);
289			unlink(socketpath);
290			return (-1);
291		}
292
293		imsg_init(ibuf, connfd);
294		if ((n = imsg_read(ibuf)) == -1 || n == 0) {
295			fprintf(stderr, "%s: imsg_read error, n=%d\n",
296			    __progname, n);
297			continue;
298		}
299
300		for (;;) {
301			if ((n = imsg_get(ibuf, &imsg)) == -1)
302				return (-1);
303
304			if (n == 0)
305				break;
306
307			/* Process incoming message (from vmmctl(8)) */
308			switch (imsg.hdr.type) {
309			case IMSG_VMDOP_DISABLE_VMM_REQUEST:
310				res = disable_vmm();
311				imsg_compose(ibuf,
312				    IMSG_VMDOP_DISABLE_VMM_RESPONSE, 0, 0, -1,
313				    &res, sizeof(res));
314				break;
315			case IMSG_VMDOP_ENABLE_VMM_REQUEST:
316				res = enable_vmm();
317				imsg_compose(ibuf,
318				    IMSG_VMDOP_ENABLE_VMM_RESPONSE, 0, 0, -1,
319				    &res, sizeof(res));
320				break;
321			case IMSG_VMDOP_START_VM_REQUEST:
322				res = start_vm(&imsg);
323				imsg_compose(ibuf,
324				    IMSG_VMDOP_START_VM_RESPONSE, 0, 0, -1,
325				    &res, sizeof(res));
326				break;
327			case IMSG_VMDOP_TERMINATE_VM_REQUEST:
328				res = terminate_vm(&imsg);
329				imsg_compose(ibuf,
330				    IMSG_VMDOP_TERMINATE_VM_RESPONSE, 0, 0, -1,
331				    &res, sizeof(res));
332				break;
333			case IMSG_VMDOP_GET_INFO_VM_REQUEST:
334				res = get_info_vm(ibuf);
335				imsg_compose(ibuf,
336				    IMSG_VMDOP_GET_INFO_VM_END_DATA, 0, 0, -1,
337				    &res, sizeof(res));
338				break;
339			}
340
341			while (ibuf->w.queued)
342				if (msgbuf_write(&ibuf->w) <= 0 && errno !=
343				    EAGAIN) {
344					fprintf(stderr, "%s: msgbuf_write "
345					    "error %d\n", __progname,
346					    errno);
347					close(fd);
348					close(connfd);
349					unlink(socketpath);
350					return (-1);
351				}
352			imsg_free(&imsg);
353		}
354		close(connfd);
355	}
356
357	signal(SIGCHLD, SIG_IGN);
358
359	return (0);
360}
361
362/*
363 * disable_vmm
364 *
365 * Disables VMM mode on all CPUs
366 *
367 * Return values:
368 *  0: success
369 *  !0 : ioctl to vmm(4) failed
370 */
371int
372disable_vmm(void)
373{
374	if (ioctl(vmm_fd, VMM_IOC_STOP, NULL) < 0)
375		return (errno);
376
377	return (0);
378}
379
380/*
381 * enable_vmm
382 *
383 * Enables VMM mode on all CPUs
384 *
385 * Return values:
386 *  0: success
387 *  !0 : ioctl to vmm(4) failed
388 */
389int
390enable_vmm(void)
391{
392	if (ioctl(vmm_fd, VMM_IOC_START, NULL) < 0)
393		return (errno);
394
395	return (0);
396}
397
398/*
399 * terminate_vm
400 *
401 * Requests vmm(4) to terminate the VM whose ID is provided in the
402 * supplied vm_terminate_params structure (vtp->vtp_vm_id)
403 *
404 * Parameters
405 *  imsg: The incoming imsg body whose 'data' field contains the
406 *      vm_terminate_params struct
407 *
408 * Return values:
409 *  0: success
410 *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM is not
411 *      valid)
412 */
413int
414terminate_vm(struct imsg *imsg)
415{
416	struct vm_terminate_params *vtp;
417
418	vtp = (struct vm_terminate_params *)imsg->data;
419
420	if (ioctl(vmm_fd, VMM_IOC_TERM, vtp) < 0)
421                return (errno);
422
423	return (0);
424}
425
426/*
427 * opentap
428 *
429 * Opens the next available tap device, up to MAX_TAP.
430 *
431 * Returns a file descriptor to the tap node opened, or -1 if no tap
432 * devices were available.
433 */
434int
435opentap(void)
436{
437	int i, fd;
438	char path[PATH_MAX];
439
440	for (i = 0; i < MAX_TAP; i++) {
441		snprintf(path, PATH_MAX, "/dev/tap%d", i);
442		fd = open(path, O_RDWR | O_NONBLOCK);
443		if (fd != -1)
444			return (fd);
445	}
446
447	return (-1);
448}
449
450/*
451 * start_vm
452 *
453 * Starts a new VM with the creation parameters supplied (in the incoming
454 * imsg->data field). This function performs a basic sanity check on the
455 * incoming parameters and then performs the following steps to complete
456 * the creation of the VM:
457 *
458 * 1. opens the VM disk image files specified in the VM creation parameters
459 * 2. opens the specified VM kernel
460 * 3. creates a VM console tty pair using openpty
461 * 4. forks, passing the file descriptors opened in steps 1-3 to the child
462 *     vmd responsible for dropping privilege and running the VM's VCPU
463 *     loops.
464 *
465 * Parameters:
466 *  imsg: The incoming imsg body whose 'data' field is a vm_create_params
467 *      struct containing the VM creation parameters.
468 *
469 * Return values:
470 *  0: success
471 *  !0 : failure - typically an errno indicating the source of the failure
472 */
473int
474start_vm(struct imsg *imsg)
475{
476	struct vm_create_params *vcp;
477	size_t i;
478	off_t kernel_size;
479	struct stat sb;
480	int child_disks[VMM_MAX_DISKS_PER_VM], kernel_fd, ret, ttym_fd;
481	int child_taps[VMM_MAX_NICS_PER_VM];
482	int ttys_fd;
483	char ptyn[32];
484
485	vcp = (struct vm_create_params *)imsg->data;
486
487	for (i = 0 ; i < VMM_MAX_DISKS_PER_VM; i++)
488		child_disks[i] = -1;
489	for (i = 0 ; i < VMM_MAX_NICS_PER_VM; i++)
490		child_taps[i] = -1;
491
492	/*
493	 * XXX kernel_fd can't be global (possible race if multiple VMs
494	 * being created at the same time). Probably need to move this
495	 * into the child before dropping privs, or just make it local
496	 * to this function?
497	 */
498	kernel_fd = -1;
499
500	ttym_fd = -1;
501	ttys_fd = -1;
502
503	/* Open disk images for child */
504	for (i = 0 ; i < vcp->vcp_ndisks; i++) {
505		child_disks[i] = open(vcp->vcp_disks[i], O_RDWR);
506		if (child_disks[i] == -1) {
507			ret = errno;
508			fprintf(stderr, "%s: can't open %s (%d)\n", __progname,
509			    vcp->vcp_disks[i], errno);
510			goto err;
511		}
512	}
513
514	bzero(&sb, sizeof(sb));
515	if (stat(vcp->vcp_kernel, &sb) == -1) {
516		ret = errno;
517		fprintf(stderr, "%s: can't stat kernel image %s (%d)\n",
518		    __progname, vcp->vcp_kernel, errno);
519		goto err;
520	}
521
522	kernel_size = sb.st_size;
523
524	/* Open kernel image */
525	kernel_fd = open(vcp->vcp_kernel, O_RDONLY);
526	if (kernel_fd == -1) {
527		ret = errno;
528		fprintf(stderr, "%s: can't open kernel image %s (%d)\n",
529		    __progname, vcp->vcp_kernel, errno);
530		goto err;
531	}
532
533	if (openpty(&ttym_fd, &ttys_fd, ptyn, NULL, NULL) == -1) {
534		ret = errno;
535		fprintf(stderr, "%s: openpty failed: %d\n",
536		    __progname, errno);
537		goto err;
538	}
539
540	if (close(ttys_fd)) {
541		ret = errno;
542		fprintf(stderr, "%s: close tty failed: %d\n",
543		    __progname, errno);
544		goto err;
545	}
546
547	/* Open tap devices for child */
548	for (i = 0 ; i < vcp->vcp_nnics; i++) {
549		child_taps[i] = opentap();
550		if (child_taps[i] == -1) {
551			ret = errno;
552			fprintf(stderr, "%s: can't open tap for nic %zd (%d)\n",
553			    __progname, i, errno);
554			goto err;
555		}
556	}
557
558	/* Start child vmd for this VM (fork, chroot, drop privs) */
559	ret = start_client_vmd();
560
561	/* Start child failed? - cleanup and leave */
562	if (ret == -1) {
563		ret = EIO;
564		goto err;
565	}
566
567	if (ret > 0) {
568		/* Parent */
569		for (i = 0 ; i < vcp->vcp_ndisks; i++)
570			close(child_disks[i]);
571
572		for (i = 0 ; i < vcp->vcp_nnics; i++)
573			close(child_taps[i]);
574
575		close(kernel_fd);
576		close(ttym_fd);
577
578		return (0);
579	}
580	else {
581		/* Child */
582		fprintf(stderr, "%s: vm console: %s\n", __progname, ptyn);
583		ret = vmm_create_vm(vcp);
584		setproctitle(vcp->vcp_name);
585		if (ret) {
586			fprintf(stderr, "%s: create vmm ioctl failed - "
587			    "exiting (%d)\n", __progname, ret);
588			_exit(1);
589		}
590
591		/* Load kernel image */
592		ret = loadelf_main(kernel_fd, vcp->vcp_id, vcp->vcp_memory_size);
593		if (ret) {
594			fprintf(stderr, "%s: failed to load kernel - "
595			    "exiting (%d)\n", __progname, ret);
596			_exit(1);
597		}
598
599		close(kernel_fd);
600
601		con_fd = ttym_fd;
602		if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) {
603			fprintf(stderr, "%s: failed to set nonblocking mode "
604			    "on console\n", __progname);
605			_exit(1);
606		}
607
608		/* Execute the vcpu run loop(s) for this VM */
609		ret = run_vm(child_disks, child_taps, vcp);
610		_exit(ret != 0);
611	}
612
613	return (ret);
614
615err:
616	for (i = 0 ; i < vcp->vcp_ndisks; i++)
617		if (child_disks[i] != -1)
618			close(child_disks[i]);
619
620	for (i = 0 ; i < vcp->vcp_nnics; i++)
621		if (child_taps[i] != -1)
622			close(child_taps[i]);
623
624	if (kernel_fd != -1)
625		close(kernel_fd);
626
627	if (ttym_fd != -1)
628		close(ttym_fd);
629
630	return (ret);
631}
632
633/*
634 * get_info_vm
635 *
636 * Returns a list of VMs known to vmm(4).
637 *
638 * Parameters:
639 *  ibuf: the imsg ibuf in which to place the results. A new imsg will
640 *      be created using this ibuf.
641 *
642 * Return values:
643 *  0: success
644 *  !0 : failure (eg, ENOMEM, EIO or another error code from vmm(4) ioctl)
645 */
646int
647get_info_vm(struct imsgbuf *ibuf)
648{
649	int ret;
650	size_t ct, i;
651	struct ibuf *obuf;
652	struct vm_info_params vip;
653	struct vm_info_result *info;
654
655	/*
656	 * We issue the VMM_IOC_INFO ioctl twice, once with an input
657	 * buffer size of 0, which results in vmm(4) returning the
658	 * number of bytes required back to us in vip.vip_size,
659	 * and then we call it again after malloc'ing the required
660	 * number of bytes.
661	 *
662	 * It is possible that we could fail a second time (eg, if
663	 * another VM was created in the instant between the two
664	 * ioctls, but in that case the caller can just try again
665	 * as vmm(4) will return a zero-sized list in that case.
666	 */
667	vip.vip_size = 0;
668	info = NULL;
669	ret = 0;
670
671	/* First ioctl to see how many bytes needed (vip.vip_size) */
672	if (ioctl(vmm_fd, VMM_IOC_INFO, &vip) < 0)
673		return (errno);
674
675	if (vip.vip_info_ct != 0)
676		return (EIO);
677
678	info = malloc(vip.vip_size);
679	if (info == NULL)
680		return (ENOMEM);
681
682	/* Second ioctl to get the actual list */
683	vip.vip_info = info;
684	if (ioctl(vmm_fd, VMM_IOC_INFO, &vip) < 0) {
685		ret = errno;
686		free(info);
687		return (ret);
688	}
689
690	/* Return info to vmmctl(4) */
691	ct = vip.vip_size / sizeof(struct vm_info_result);
692	for (i = 0; i < ct; i++) {
693		obuf = imsg_create(ibuf, IMSG_VMDOP_GET_INFO_VM_DATA, 0, 0,
694		    sizeof(struct vm_info_result));
695		imsg_add(obuf, &info[i], sizeof(struct vm_info_result));
696		imsg_close(ibuf, obuf);
697	}
698	free(info);
699	return (0);
700}
701
702
703/*
704 * start_client_vmd
705 *
706 * forks a copy of the parent vmd, chroots to VMD_USER's home, drops
707 * privileges (changes to user VMD_USER), and returns.
708 * Should the fork operation succeed, but later chroot/privsep
709 * fail, the child exits.
710 *
711 * Return values (returns to both child and parent on success):
712 *  -1 : failure
713 *  0: return to child vmd returns 0
714 *  !0 : return to parent vmd returns the child's pid
715 */
716int
717start_client_vmd(void)
718{
719	int child_pid;
720	struct passwd *pw;
721
722	pw = getpwnam(VMD_USER);
723	if (pw == NULL) {
724		fprintf(stderr, "%s: no such user %s\n", __progname, VMD_USER);
725		return (-1);
726	}
727
728	child_pid = fork();
729	if (child_pid < 0)
730		return (-1);
731
732	if (!child_pid) {
733		/* Child */
734		if (chroot(pw->pw_dir) != 0)
735			err(1, "unable to chroot");
736		if (chdir("/") != 0)
737			err(1, "unable to chdir");
738
739		if (setgroups(1, &pw->pw_gid) == -1)
740			err(1, "setgroups() failed");
741		if (setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) == -1)
742			err(1, "setresgid() failed");
743		if (setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid) == -1)
744			err(1, "setresuid() failed");
745
746		return (0);
747	}
748
749	/* Parent */
750	return (child_pid);
751}
752
753/*
754 * vmm_create_vm
755 *
756 * Requests vmm(4) to create a new VM using the supplied creation
757 * parameters. This operation results in the creation of the in-kernel
758 * structures for the VM, but does not start the VM's vcpu(s).
759 *
760 * Parameters:
761 *  vcp: vm_create_params struct containing the VM's desired creation
762 *      configuration
763 *
764 * Return values:
765 *  0: success
766 *  !0 : ioctl to vmm(4) failed
767 */
768int
769vmm_create_vm(struct vm_create_params *vcp)
770{
771	/* Sanity check arguments */
772	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
773		return (EINVAL);
774
775	if (vcp->vcp_memory_size > VMM_MAX_VM_MEM_SIZE)
776		return (EINVAL);
777
778	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
779		return (EINVAL);
780
781	if (ioctl(vmm_fd, VMM_IOC_CREATE, vcp) < 0)
782		return (errno);
783
784	return (0);
785}
786
787/*
788 * init_emulated_hw
789 *
790 * Initializes the userspace hardware emulation
791 */
792void
793init_emulated_hw(struct vm_create_params *vcp, int *child_disks,
794    int *child_taps)
795{
796	/* Init the i8253 PIT's 3 counters */
797	bzero(&i8253_counter, sizeof(struct i8253_counter) * 3);
798	gettimeofday(&i8253_counter[0].tv, NULL);
799	gettimeofday(&i8253_counter[1].tv, NULL);
800	gettimeofday(&i8253_counter[2].tv, NULL);
801	i8253_counter[0].start = TIMER_DIV(100);
802	i8253_counter[1].start = TIMER_DIV(100);
803	i8253_counter[2].start = TIMER_DIV(100);
804
805	/* Init ns8250 UART */
806	bzero(&com1_regs, sizeof(struct ns8250_regs));
807
808	/* Initialize PCI */
809	pci_init();
810
811	/* Initialize virtio devices */
812	virtio_init(vcp, child_disks, child_taps);
813}
814
815/*
816 * run_vm
817 *
818 * Runs the VM whose creation parameters are specified in vcp
819 *
820 * Parameters:
821 *  vcp: vm_create_params struct containing the VM's desired creation
822 *      configuration
823 *  child_disks: previously-opened child VM disk file file descriptors
824 *  child_taps: previously-opened child tap file descriptors
825 *
826 * Return values:
827 *  0: the VM exited normally
828 *  !0 : the VM exited abnormally or failed to start
829 */
830int
831run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp)
832{
833	size_t i;
834	int ret;
835	pthread_t *tid;
836	void *exit_status;
837	struct vm_run_params **vrp;
838
839	ret = 0;
840
841	/* XXX cap vcp_ncpus to avoid overflow here */
842	/*
843	 * XXX ensure nvcpus in vcp is same as vm, or fix vmm to return einval
844	 * on bad vcpu id
845	 */
846	tid = malloc(sizeof(pthread_t) * vcp->vcp_ncpus);
847	vrp = malloc(sizeof(struct vm_run_params *) * vcp->vcp_ncpus);
848	if (tid == NULL || vrp == NULL) {
849		fprintf(stderr, "%s: memory allocation error - exiting.\n",
850		    __progname);
851		return (ENOMEM);
852	}
853
854	init_emulated_hw(vcp, child_disks, child_taps);
855
856	/*
857	 * Create and launch one thread for each VCPU. These threads may
858	 * migrate between PCPUs over time; the need to reload CPU state
859	 * in such situations is detected and performed by vmm(4) in the
860	 * kernel.
861	 */
862	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
863		vrp[i] = malloc(sizeof(struct vm_run_params));
864		if (vrp[i] == NULL) {
865			fprintf(stderr, "%s: memory allocation error - "
866			    "exiting.\n", __progname);
867			/* caller will exit, so skip free'ing */
868			return (ENOMEM);
869		}
870		vrp[i]->vrp_exit = malloc(sizeof(union vm_exit));
871		if (vrp[i]->vrp_exit == NULL) {
872			fprintf(stderr, "%s: memory allocation error - "
873			    "exiting.\n", __progname);
874			/* caller will exit, so skip free'ing */
875			return (ENOMEM);
876		}
877		vrp[i]->vrp_vm_id = vcp->vcp_id;
878		vrp[i]->vrp_vcpu_id = i;
879
880		/* Start each VCPU run thread at vcpu_run_loop */
881		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
882		if (ret) {
883			/* caller will _exit after this return */
884			return (ret);
885		}
886	}
887
888	/* Wait for all the threads to exit */
889	for (i = 0; i < vcp->vcp_ncpus; i++) {
890		if (pthread_join(tid[i], &exit_status)) {
891			fprintf(stderr, "%s: failed to join thread %zd - "
892			    "exiting\n", __progname, i);
893			return (EIO);
894		}
895
896		if (exit_status != NULL) {
897			fprintf(stderr, "%s: vm %d vcpu run thread %zd exited "
898			    "abnormally\n", __progname, vcp->vcp_id, i);
899			ret = EIO;
900		}
901	}
902
903	return (ret);
904}
905
906/*
907 * vcpu_run_loop
908 *
909 * Runs a single VCPU until vmm(4) requires help handling an exit,
910 * or the VM terminates.
911 *
912 * Parameters:
913 *  arg: vcpu_run_params for the VCPU being run by this thread
914 *
915 * Return values:
916 *  NULL: the VCPU shutdown properly
917 *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
918 */
919void *
920vcpu_run_loop(void *arg)
921{
922	struct vm_run_params *vrp = (struct vm_run_params *)arg;
923	intptr_t ret;
924
925	vrp->vrp_continue = 0;
926	vrp->vrp_injint = -1;
927
928	for (;;) {
929		if (ioctl(vmm_fd, VMM_IOC_RUN, vrp) < 0) {
930			/* If run ioctl failed, exit */
931			ret = errno;
932			return ((void *)ret);
933		}
934
935		/* If the VM is terminating, exit normally */
936		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED)
937			return (NULL);
938
939		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
940			/*
941			 * vmm(4) needs help handling an exit, handle in
942			 * vcpu_exit.
943			 */
944			if (vcpu_exit(vrp))
945				return ((void *)EIO);
946		}
947	}
948
949	return (NULL);
950}
951
952/*
953 * vcpu_exit_i8253
954 *
955 * Handles emulated i8253 PIT access (in/out instruction to PIT ports).
956 * We don't emulate all the modes of the i8253, just the basic squarewave
957 * clock.
958 *
959 * Parameters:
960 *  vei: VM exit information from vmm(4) containing information on the in/out
961 *      instruction being performed
962 */
963void
964vcpu_exit_i8253(union vm_exit *vei)
965{
966	uint32_t out_data;
967	uint8_t sel, rw, data;
968	uint64_t ns, ticks;
969	struct timeval now, delta;
970
971	if (vei->vei.vei_port == TIMER_CTRL) {
972		if (vei->vei.vei_dir == 0) { /* OUT instruction */
973			out_data = vei->vei.vei_data;
974			sel = out_data &
975			    (TIMER_SEL0 | TIMER_SEL1 | TIMER_SEL2);
976			sel = sel >> 6;
977			if (sel > 2) {
978				fprintf(stderr, "%s: i8253 PIT: invalid "
979				    "timer selected (%d)\n",
980				    __progname, sel);
981				return;
982			}
983
984			rw = vei->vei.vei_data &
985			    (TIMER_LATCH | TIMER_LSB |
986			    TIMER_MSB | TIMER_16BIT);
987
988			if (rw == TIMER_16BIT) {
989				/*
990				 * XXX this seems to be used on occasion, needs
991				 * to be implemented
992				 */
993				fprintf(stderr, "%s: i8253 PIT: 16 bit "
994				    "counter I/O not supported\n",
995				    __progname);
996				    return;
997			}
998
999			/*
1000			 * Since we don't truly emulate each tick of the PIT
1001			 * clock, when the guest asks for the timer to be
1002			 * latched, simulate what the counter would have been
1003			 * had we performed full emulation. We do this by
1004			 * calculating when the counter was reset vs how much
1005			 * time has elapsed, then bias by the counter tick
1006			 * rate.
1007			 */
1008			if (rw == TIMER_LATCH) {
1009				gettimeofday(&now, NULL);
1010				delta.tv_sec = now.tv_sec -
1011				    i8253_counter[sel].tv.tv_sec;
1012				delta.tv_usec = now.tv_usec -
1013				    i8253_counter[sel].tv.tv_usec;
1014				if (delta.tv_usec < 0) {
1015					delta.tv_sec--;
1016					delta.tv_usec += 1000000;
1017				}
1018				if (delta.tv_usec > 1000000) {
1019					delta.tv_sec++;
1020					delta.tv_usec -= 1000000;
1021				}
1022				ns = delta.tv_usec * 1000 +
1023				    delta.tv_sec * 1000000000;
1024				ticks = ns / NS_PER_TICK;
1025				i8253_counter[sel].olatch =
1026				    i8253_counter[sel].start -
1027				    ticks % i8253_counter[sel].start;
1028				return;
1029			}
1030
1031			fprintf(stderr, "%s: i8253 PIT: unsupported rw mode "
1032			    "%d\n", __progname, rw);
1033			return;
1034		} else {
1035			/* XXX should this return 0xff? */
1036			fprintf(stderr, "%s: i8253 PIT: read from control "
1037			    "port unsupported\n", __progname);
1038		}
1039	} else {
1040		sel = vei->vei.vei_port - (TIMER_CNTR0 + TIMER_BASE);
1041		if (vei->vei.vei_dir == 0) { /* OUT instruction */
1042			if (i8253_counter[sel].last_w == 0) {
1043				out_data = vei->vei.vei_data;
1044				i8253_counter[sel].ilatch |= (out_data << 8);
1045				i8253_counter[sel].last_w = 1;
1046			} else {
1047				out_data = vei->vei.vei_data;
1048				i8253_counter[sel].ilatch |= out_data;
1049				i8253_counter[sel].start =
1050				    i8253_counter[sel].ilatch;
1051				i8253_counter[sel].last_w = 0;
1052			}
1053		} else {
1054			if (i8253_counter[sel].last_r == 0) {
1055				data = i8253_counter[sel].olatch >> 8;
1056				vei->vei.vei_data = data;
1057				i8253_counter[sel].last_w = 1;
1058			} else {
1059				data = i8253_counter[sel].olatch & 0xFF;
1060				vei->vei.vei_data = data;
1061				i8253_counter[sel].last_w = 0;
1062			}
1063		}
1064	}
1065}
1066
1067/*
1068 * vcpu_process_com_data
1069 *
1070 * Emulate in/out instructions to the com1 (ns8250) UART data register
1071 *
1072 * Parameters:
1073 *  vei: vm exit information from vmm(4) containing information on the in/out
1074 *      instruction being performed
1075 */
1076void
1077vcpu_process_com_data(union vm_exit *vei)
1078{
1079	/*
1080	 * vei_dir == 0 : out instruction
1081	 *
1082	 * The guest wrote to the data register. Since we are emulating a
1083	 * no-fifo chip, write the character immediately to the pty and
1084	 * assert TXRDY in IIR (if the guest has requested TXRDY interrupt
1085	 * reporting)
1086	 */
1087	if (vei->vei.vei_dir == 0) {
1088		write(con_fd, &vei->vei.vei_data, 1);
1089		if (com1_regs.ier & 0x2) {
1090			/* Set TXRDY */
1091			com1_regs.iir |= IIR_TXRDY;
1092			/* Set "interrupt pending" (IIR low bit cleared) */
1093			com1_regs.iir &= ~0x1;
1094		}
1095	} else {
1096		/*
1097		 * vei_dir == 1 : in instruction
1098		 *
1099		 * The guest read from the data register. Check to see if
1100		 * there is data available (RXRDY) and if so, consume the
1101		 * input data and return to the guest. Also clear the
1102		 * interrupt info register regardless.
1103		 */
1104		if (com1_regs.lsr & LSR_RXRDY) {
1105			vei->vei.vei_data = com1_regs.data;
1106			com1_regs.data = 0x0;
1107			com1_regs.lsr &= ~LSR_RXRDY;
1108		} else {
1109			/* XXX should this be com1_regs.data or 0xff? */
1110			vei->vei.vei_data = com1_regs.data;
1111			fprintf(stderr, "guest reading com1 when not ready\n");
1112		}
1113
1114		/* Reading the data register always clears RXRDY from IIR */
1115		com1_regs.iir &= ~IIR_RXRDY;
1116
1117		/*
1118		 * Clear "interrupt pending" by setting IIR low bit to 1
1119		 * if no interrupt are pending
1120		 */
1121		if (com1_regs.iir == 0x0)
1122			com1_regs.iir = 0x1;
1123	}
1124}
1125
1126/*
1127 * vcpu_process_com_lcr
1128 *
1129 * Emulate in/out instructions to the com1 (ns8250) UART line control register
1130 *
1131 * Paramters:
1132 *  vei: vm exit information from vmm(4) containing information on the in/out
1133 *      instruction being performed
1134 */
1135void
1136vcpu_process_com_lcr(union vm_exit *vei)
1137{
1138	/*
1139	 * vei_dir == 0 : out instruction
1140	 *
1141	 * Write content to line control register
1142	 */
1143	if (vei->vei.vei_dir == 0) {
1144		com1_regs.lcr = (uint8_t)vei->vei.vei_data;
1145	} else {
1146		/*
1147		 * vei_dir == 1 : in instruction
1148		 *
1149		 * Read line control register
1150		 */
1151		vei->vei.vei_data = com1_regs.lcr;
1152	}
1153}
1154
1155/*
1156 * vcpu_process_com_iir
1157 *
1158 * Emulate in/out instructions to the com1 (ns8250) UART interrupt information
1159 * register. Note that writes to this register actually are to a different
1160 * register, the FCR (FIFO control register) that we don't emulate but still
1161 * consume the data provided.
1162 *
1163 * Parameters:
1164 *  vei: vm exit information from vmm(4) containing information on the in/out
1165 *      instruction being performed
1166 */
1167void
1168vcpu_process_com_iir(union vm_exit *vei)
1169{
1170	/*
1171	 * vei_dir == 0 : out instruction
1172	 *
1173	 * Write to FCR
1174	 */
1175	if (vei->vei.vei_dir == 0) {
1176		com1_regs.fcr = vei->vei.vei_data;
1177	} else {
1178		/*
1179		 * vei_dir == 1 : in instruction
1180		 *
1181		 * Read IIR. Reading the IIR resets the TXRDY bit in the IIR
1182		 * after the data is read.
1183		 */
1184		vei->vei.vei_data = com1_regs.iir;
1185		com1_regs.iir &= ~IIR_TXRDY;
1186
1187		/*
1188		 * Clear "interrupt pending" by setting IIR low bit to 1
1189		 * if no interrupts are pending
1190		 */
1191		if (com1_regs.iir == 0x0)
1192			com1_regs.iir = 0x1;
1193	}
1194}
1195
1196/*
1197 * vcpu_process_com_mcr
1198 *
1199 * Emulate in/out instructions to the com1 (ns8250) UART modem control
1200 * register.
1201 *
1202 * Parameters:
1203 *  vei: vm exit information from vmm(4) containing information on the in/out
1204 *      instruction being performed
1205 */
1206void
1207vcpu_process_com_mcr(union vm_exit *vei)
1208{
1209	/*
1210	 * vei_dir == 0 : out instruction
1211	 *
1212	 * Write to MCR
1213	 */
1214	if (vei->vei.vei_dir == 0) {
1215		com1_regs.mcr = vei->vei.vei_data;
1216	} else {
1217		/*
1218		 * vei_dir == 1 : in instruction
1219		 *
1220		 * Read from MCR
1221		 */
1222		vei->vei.vei_data = com1_regs.mcr;
1223	}
1224}
1225
1226/*
1227 * vcpu_process_com_lsr
1228 *
1229 * Emulate in/out instructions to the com1 (ns8250) UART line status register.
1230 *
1231 * Parameters:
1232 *  vei: vm exit information from vmm(4) containing information on the in/out
1233 *      instruction being performed
1234 */
1235void
1236vcpu_process_com_lsr(union vm_exit *vei)
1237{
1238	/*
1239	 * vei_dir == 0 : out instruction
1240	 *
1241	 * Write to LSR. This is an illegal operation, so we just log it and
1242	 * continue.
1243	 */
1244	if (vei->vei.vei_dir == 0) {
1245		fprintf(stderr, "%s: LSR UART write 0x%x unsupported\n",
1246		    __progname, vei->vei.vei_data);
1247	} else {
1248		/*
1249		 * vei_dir == 1 : in instruction
1250		 *
1251		 * Read from LSR. We always report TXRDY and TSRE since we
1252		 * can process output characters immediately (at any time).
1253		 */
1254		vei->vei.vei_data = com1_regs.lsr | LSR_TSRE | LSR_TXRDY;
1255	}
1256}
1257
1258/*
1259 * vcpu_process_com_msr
1260 *
1261 * Emulate in/out instructions to the com1 (ns8250) UART modem status register.
1262 *
1263 * Parameters:
1264 *  vei: vm exit information from vmm(4) containing information on the in/out
1265 *      instruction being performed
1266 */
1267void
1268vcpu_process_com_msr(union vm_exit *vei)
1269{
1270	/*
1271	 * vei_dir == 0 : out instruction
1272	 *
1273	 * Write to MSR. This is an illegal operation, so we just log it and
1274	 * continue.
1275	 */
1276	if (vei->vei.vei_dir == 0) {
1277		fprintf(stderr, "%s: MSR UART write 0x%x unsupported\n",
1278		    __progname, vei->vei.vei_data);
1279	} else {
1280		/*
1281		 * vei_dir == 1 : in instruction
1282		 *
1283		 * Read from MSR. We always report DCD, DSR, and CTS.
1284		 */
1285		vei->vei.vei_data = com1_regs.lsr | MSR_DCD | MSR_DSR | MSR_CTS;
1286	}
1287}
1288
1289/*
1290 * vcpu_process_com_scr
1291 *
1292 * Emulate in/out instructions to the com1 (ns8250) UART scratch register. The
1293 * scratch register is sometimes used to distinguish an 8250 from a 16450,
1294 * and/or used to distinguish submodels of the 8250 (eg 8250A, 8250B). We
1295 * simulate an "original" 8250 by forcing the scratch register to return data
1296 * on read that is different from what was written.
1297 *
1298 * Parameters:
1299 *  vei: vm exit information from vmm(4) containing information on the in/out
1300 *      instruction being performed
1301 */
1302void
1303vcpu_process_com_scr(union vm_exit *vei)
1304{
1305	/*
1306	 * vei_dir == 0 : out instruction
1307	 *
1308	 * Write to SCR
1309	 */
1310	if (vei->vei.vei_dir == 0) {
1311		com1_regs.scr = vei->vei.vei_data;
1312	} else {
1313		/*
1314		 * vei_dir == 1 : in instruction
1315		 *
1316		 * Read from SCR. To make sure we don't accidentally simulate
1317		 * a real scratch register, we negate what was written on
1318		 * subsequent readback.
1319		 */
1320		vei->vei.vei_data = ~com1_regs.scr;
1321	}
1322}
1323
1324/*
1325 * vcpu_process_com_ier
1326 *
1327 * Emulate in/out instructions to the com1 (ns8250) UART interrupt enable
1328 * register.
1329 *
1330 * Parameters:
1331 *  vei: vm exit information from vmm(4) containing information on the in/out
1332 *      instruction being performed
1333 */
1334void
1335vcpu_process_com_ier(union vm_exit *vei)
1336{
1337	/*
1338	 * vei_dir == 0 : out instruction
1339	 *
1340	 * Write to IER
1341	 */
1342	if (vei->vei.vei_dir == 0) {
1343		com1_regs.ier = vei->vei.vei_data;
1344	} else {
1345		/*
1346		 * vei_dir == 1 : in instruction
1347		 *
1348		 * Read from IER
1349		 */
1350		vei->vei.vei_data = com1_regs.ier;
1351	}
1352}
1353
1354/*
1355 * vcpu_exit_com
1356 *
1357 * Process com1 (ns8250) UART exits. vmd handles most basic 8250
1358 * features with the exception of the divisor latch (eg, no baud
1359 * rate support)
1360 *
1361 * Parameters:
1362 *  vrp: vcpu run parameters containing guest state for this exit
1363 */
1364void
1365vcpu_exit_com(struct vm_run_params *vrp)
1366{
1367	union vm_exit *vei = vrp->vrp_exit;
1368
1369	switch(vei->vei.vei_port) {
1370	case COM1_LCR:
1371		vcpu_process_com_lcr(vei);
1372		break;
1373	case COM1_IER:
1374		vcpu_process_com_ier(vei);
1375		break;
1376	case COM1_IIR:
1377		vcpu_process_com_iir(vei);
1378		break;
1379	case COM1_MCR:
1380		vcpu_process_com_mcr(vei);
1381		break;
1382	case COM1_LSR:
1383		vcpu_process_com_lsr(vei);
1384		break;
1385	case COM1_MSR:
1386		vcpu_process_com_msr(vei);
1387		break;
1388	case COM1_SCR:
1389		vcpu_process_com_scr(vei);
1390		break;
1391	case COM1_DATA:
1392		vcpu_process_com_data(vei);
1393		break;
1394	}
1395}
1396
1397/*
1398 * vcpu_exit_pci
1399 *
1400 * Handle all I/O to the emulated PCI subsystem.
1401 *
1402 * Parameters:
1403 *  vrp: vcpu run paramters containing guest state for this exit
1404 *
1405 * Return values:
1406 *  0xff if no interrupt is required after this pci exit,
1407 *      or an interrupt vector otherwise
1408 */
1409uint8_t
1410vcpu_exit_pci(struct vm_run_params *vrp)
1411{
1412	union vm_exit *vei = vrp->vrp_exit;
1413	uint8_t intr;
1414
1415	intr = 0xFF;
1416
1417	switch(vei->vei.vei_port) {
1418	case PCI_MODE1_ADDRESS_REG:
1419		pci_handle_address_reg(vrp);
1420		break;
1421	case PCI_MODE1_DATA_REG:
1422		pci_handle_data_reg(vrp);
1423		break;
1424	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
1425		intr = pci_handle_io(vrp);
1426		break;
1427	default:
1428		fprintf(stderr, "%s: unknown PCI register 0x%llx\n",
1429		    __progname, (uint64_t)vei->vei.vei_port);
1430		break;
1431	}
1432
1433	return (intr);
1434}
1435
1436/*
1437 * vcpu_exit_inout
1438 *
1439 * Handle all I/O exits that need to be emulated in vmd. This includes the
1440 * i8253 PIT and the com1 ns8250 UART.
1441 *
1442 * Parameters:
1443 *  vrp: vcpu run parameters containing guest state for this exit
1444 */
1445void
1446vcpu_exit_inout(struct vm_run_params *vrp)
1447{
1448	union vm_exit *vei = vrp->vrp_exit;
1449	uint8_t intr;
1450
1451	switch(vei->vei.vei_port) {
1452	case TIMER_CTRL:
1453	case (TIMER_CNTR0 + TIMER_BASE):
1454	case (TIMER_CNTR1 + TIMER_BASE):
1455	case (TIMER_CNTR2 + TIMER_BASE):
1456		vcpu_exit_i8253(vei);
1457		break;
1458	case COM1_DATA ... COM1_SCR:
1459		vcpu_exit_com(vrp);
1460		break;
1461	case PCI_MODE1_ADDRESS_REG:
1462	case PCI_MODE1_DATA_REG:
1463	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
1464		intr = vcpu_exit_pci(vrp);
1465		if (intr != 0xFF)
1466			vrp->vrp_injint = intr;
1467		else
1468			vrp->vrp_injint = -1;
1469		break;
1470	default:
1471		/* IN from unsupported port gives FFs */
1472		if (vei->vei.vei_dir == 1)
1473			vei->vei.vei_data = 0xFFFFFFFF;
1474		break;
1475	}
1476}
1477
1478/*
1479 * vcpu_exit
1480 *
1481 * Handle a vcpu exit. This function is called when it is determined that
1482 * vmm(4) requires the assistance of vmd to support a particular guest
1483 * exit type (eg, accessing an I/O port or device). Guest state is contained
1484 * in 'vrp', and will be resent to vmm(4) on exit completion.
1485 *
1486 * Upon conclusion of handling the exit, the function determines if any
1487 * interrupts should be injected into the guest, and sets vrp->vrp_injint
1488 * to the IRQ line whose interrupt should be vectored (or -1 if no interrupt
1489 * is to be injected).
1490 *
1491 * Parameters:
1492 *  vrp: vcpu run parameters containing guest state for this exit
1493 *
1494 * Return values:
1495 *  0: the exit was handled successfully
1496 *  1: an error occurred (exit not handled)
1497 */
1498int
1499vcpu_exit(struct vm_run_params *vrp)
1500{
1501	ssize_t sz;
1502	char ch;
1503
1504	switch (vrp->vrp_exit_reason) {
1505	case VMX_EXIT_IO:
1506		vcpu_exit_inout(vrp);
1507		break;
1508	case VMX_EXIT_HLT:
1509		/*
1510		 * XXX handle halted state, no reason to run this vcpu again
1511		 * until a vm interrupt is to be injected
1512		 */
1513		break;
1514	default:
1515		fprintf(stderr, "%s: unknown exit reason %d\n",
1516		    __progname, vrp->vrp_exit_reason);
1517		return (1);
1518	}
1519
1520	/* XXX interrupt priority */
1521	if (vionet_process_rx())
1522		vrp->vrp_injint = 9;
1523
1524	/*
1525	 * Is there a new character available on com1?
1526	 * If so, consume the character, buffer it into the com1 data register
1527	 * assert IRQ4, and set the line status register RXRDY bit.
1528	 *
1529	 * XXX - move all this com intr checking to another function
1530	 */
1531	sz = read(con_fd, &ch, sizeof(char));
1532	if (sz == 1) {
1533		com1_regs.lsr |= LSR_RXRDY;
1534		com1_regs.data = ch;
1535		/* XXX these ier and iir bits should be IER_x and IIR_x */
1536		if (com1_regs.ier & 0x1) {
1537			com1_regs.iir |= (2 << 1);
1538			com1_regs.iir &= ~0x1;
1539		}
1540	}
1541
1542	/*
1543	 * Clear "interrupt pending" by setting IIR low bit to 1 if no
1544	 * interrupts are pending
1545	 */
1546	/* XXX these iir magic numbers should be IIR_x */
1547	if ((com1_regs.iir & ~0x1) == 0x0)
1548		com1_regs.iir = 0x1;
1549
1550	/* If pending interrupt and nothing waiting to be injected, inject */
1551	if ((com1_regs.iir & 0x1) == 0)
1552		if (vrp->vrp_injint == -1)
1553			vrp->vrp_injint = 0x4;
1554	vrp->vrp_continue = 1;
1555
1556	return (0);
1557}
1558
1559/*
1560 * write_page
1561 *
1562 * Pushes a page of data from 'buf' into the guest VM's memory
1563 * at paddr 'dst'.
1564 *
1565 * Parameters:
1566 *  dst: the destination paddr_t in the guest VM to push into.
1567 *      If there is no guest paddr mapping at 'dst', a new page will be
1568 *      faulted in by the VMM (provided 'dst' represents a valid paddr
1569 *      in the guest's address space)
1570 *  buf: page of data to push
1571 *  len: size of 'buf'
1572 *  do_mask: 1 to mask the destination address (for kernel load), 0 to
1573 *      leave 'dst' unmasked
1574 *
1575 * Return values:
1576 *  various return values from ioctl(VMM_IOC_WRITEPAGE), or 0 if no error
1577 *      occurred.
1578 *
1579 * Note - this function only handles GPAs < 4GB.
1580 */
1581int
1582write_page(uint32_t dst, void *buf, uint32_t len, int do_mask)
1583{
1584	int ret;
1585	struct vm_writepage_params vwp;
1586
1587	/*
1588	 * Mask kernel load addresses to avoid uint32_t -> uint64_t cast
1589	 * errors
1590	 */
1591	if (do_mask)
1592		dst &= 0xFFFFFFF;
1593
1594	vwp.vwp_paddr = (paddr_t)dst;
1595	vwp.vwp_data = buf;
1596	vwp.vwp_vm_id = vm_id;
1597	vwp.vwp_len = len;
1598	if (ioctl(vmm_fd, VMM_IOC_WRITEPAGE, &vwp) < 0) {
1599		ret = errno;
1600		fprintf(stderr, "writepage ioctl failed: %d\n", ret);
1601		return (ret);
1602	}
1603	return (0);
1604}
1605
1606/*
1607 * read_page
1608 *
1609 * Reads a page of memory at guest paddr 'src' into 'buf'.
1610 *
1611 * Parameters:
1612 *  src: the source paddr_t in the guest VM to read from.
1613 *  buf: destination (local) buffer
1614 *  len: size of 'buf'
1615 *  do_mask: 1 to mask the source address (for kernel load), 0 to
1616 *      leave 'src' unmasked
1617 *
1618 * Return values:
1619 *  various return values from ioctl(VMM_IOC_READPAGE), or 0 if no error
1620 *      occurred.
1621 *
1622 * Note - this function only handles GPAs < 4GB.
1623 */
1624int
1625read_page(uint32_t src, void *buf, uint32_t len, int do_mask)
1626{
1627	int ret;
1628	struct vm_readpage_params vrp;
1629
1630	/*
1631	 * Mask kernel load addresses to avoid uint32_t -> uint64_t cast
1632	 * errors
1633	 */
1634	if (do_mask)
1635		src &= 0xFFFFFFF;
1636
1637	vrp.vrp_paddr = (paddr_t)src;
1638	vrp.vrp_data = buf;
1639	vrp.vrp_vm_id = vm_id;
1640	vrp.vrp_len = len;
1641	if (ioctl(vmm_fd, VMM_IOC_READPAGE, &vrp) < 0) {
1642		ret = errno;
1643		fprintf(stderr, "readpage ioctl failed: %d\n", ret);
1644		return (ret);
1645	}
1646	return (0);
1647}
1648