vmd.c revision 1.4
1/*	$OpenBSD: vmd.c,v 1.4 2015/11/23 13:04:49 reyk Exp $	*/
2
3/*
4 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19/*
20 * vmd(8) - virtual machine daemon
21 */
22
23#include <sys/types.h>
24#include <sys/ioctl.h>
25#include <sys/queue.h>
26#include <sys/uio.h>
27#include <sys/socket.h>
28#include <sys/stat.h>
29#include <sys/un.h>
30#include <sys/wait.h>
31#include <sys/mman.h>
32#include <sys/time.h>
33
34#include <dev/ic/comreg.h>
35#include <dev/ic/i8253reg.h>
36#include <dev/isa/isareg.h>
37#include <dev/pci/pcireg.h>
38
39#include <machine/param.h>
40#include <machine/vmmvar.h>
41
42#include <errno.h>
43#include <fcntl.h>
44#include <imsg.h>
45#include <limits.h>
46#include <pthread.h>
47#include <pwd.h>
48#include <signal.h>
49#include <stddef.h>
50#include <stdio.h>
51#include <stdlib.h>
52#include <string.h>
53#include <syslog.h>
54#include <termios.h>
55#include <unistd.h>
56#include <util.h>
57
58#include "vmd.h"
59#include "loadfile.h"
60#include "pci.h"
61#include "virtio.h"
62
/* Backlog passed to listen(2) on the control socket */
#define NR_BACKLOG	5

/* Number of /dev/tapN nodes (N = 0 .. MAX_TAP - 1) probed by opentap() */
#define MAX_TAP		256
66
/*
 * Emulated 8250 UART
 *
 * I/O port numbers of the COM1 registers, in ascending port order.
 */
#define COM1_DATA	0x3f8	/* data register */
#define COM1_IER	0x3f9	/* interrupt enable register */
#define COM1_IIR	0x3fa	/* interrupt ID register (FCR on write) */
#define COM1_LCR	0x3fb	/* line control register */
#define COM1_MCR	0x3fc	/* modem control register */
#define COM1_LSR	0x3fd	/* line status register */
#define COM1_MSR	0x3fe	/* modem status register */
#define COM1_SCR	0x3ff	/* scratch register */
79
/*
 * Emulated i8253 PIT (counter)
 *
 * NS_PER_TICK relies on TIMER_FREQ, which is not defined in this file
 * (presumably supplied by <dev/ic/i8253reg.h> - confirm).
 */
#define TIMER_BASE	0x40	/* base I/O port of the counter registers */
#define TIMER_CTRL	0x43	/* 8253 Timer #1 (control port) */
#define NS_PER_TICK	(1000000000 / TIMER_FREQ)	/* ns per PIT tick */
86
/*
 * State kept for one emulated i8253 counter. Three of these make up
 * the emulated PIT.
 */
struct i8253_counter {
	struct timeval tv;	/* time at which the counter was started */
	uint16_t start;		/* programmed starting (reload) value */
	uint16_t olatch;	/* value latched for the guest to read */
	uint16_t ilatch;	/* value being assembled from guest writes */
	uint8_t last_r;		/* which byte (MSB/LSB) was read last */
	uint8_t last_w;		/* which byte (MSB/LSB) was written last */
};
96
/*
 * Register file of the emulated ns8250 UART. Every register is one
 * byte wide; "data" holds the pending input byte, if any.
 */
struct ns8250_regs {
	uint8_t lcr;		/* line control */
	uint8_t fcr;		/* FIFO control */
	uint8_t iir;		/* interrupt ID */
	uint8_t ier;		/* interrupt enable */
	uint8_t divlo;		/* baud rate divisor, low byte */
	uint8_t divhi;		/* baud rate divisor, high byte */
	uint8_t msr;		/* modem status */
	uint8_t lsr;		/* line status */
	uint8_t mcr;		/* modem control */
	uint8_t scr;		/* scratch */
	uint8_t data;		/* input byte not yet read by the guest */
};
111
112struct i8253_counter i8253_counter[3];
113struct ns8250_regs com1_regs;
114
115__dead void usage(void);
116
117void sighdlr(int);
118int main(int, char **);
119int control_run(void);
120int disable_vmm(void);
121int enable_vmm(void);
122int start_vm(struct imsg *);
123int terminate_vm(struct imsg *);
124int get_info_vm(struct imsgbuf *);
125int start_client_vmd(void);
126int opentap(void);
127int run_vm(int *, int *, struct vm_create_params *);
128void *vcpu_run_loop(void *);
129int vcpu_exit(struct vm_run_params *);
130int vmm_create_vm(struct vm_create_params *);
131void init_emulated_hw(struct vm_create_params *, int *, int *);
132void vcpu_exit_inout(struct vm_run_params *);
133uint8_t vcpu_exit_pci(struct vm_run_params *);
134void vcpu_exit_i8253(union vm_exit *);
135void vcpu_exit_com(struct vm_run_params *);
136void vcpu_process_com_data(union vm_exit *);
137void vcpu_process_com_lcr(union vm_exit *);
138void vcpu_process_com_lsr(union vm_exit *);
139void vcpu_process_com_ier(union vm_exit *);
140void vcpu_process_com_mcr(union vm_exit *);
141void vcpu_process_com_iir(union vm_exit *);
142void vcpu_process_com_msr(union vm_exit *);
143void vcpu_process_com_scr(union vm_exit *);
144
145int vmm_fd, con_fd, vm_id;
146volatile sig_atomic_t quit;
147
148SLIST_HEAD(vmstate_head, vmstate);
149struct vmstate_head vmstate;
150
151extern char *__progname;
152
153/*
154 * sighdlr
155 *
156 * Signal handler for TERM/INT/CHLD signals used during daemon shutdown
157 *
158 * Parameters:
159 *  sig: signal caught
160 */
161void
162sighdlr(int sig)
163{
164	switch (sig) {
165	case SIGTERM:
166	case SIGINT:
167		/* Tell main imsg loop to exit */
168		quit = 1;
169		break;
170	case SIGCHLD:
171		while (waitpid(WAIT_ANY, 0, WNOHANG) > 0) {}
172		break;
173	}
174}
175
176__dead void
177usage(void)
178{
179	extern char *__progname;
180	fprintf(stderr, "usage: %s [-dv]", __progname);
181	exit(1);
182}
183
184int
185main(int argc, char **argv)
186{
187	int debug = 0, verbose = 0, c, res;
188
189	while ((c = getopt(argc, argv, "dv")) != -1) {
190		switch (c) {
191		case 'd':
192			debug = 2;
193			break;
194		case 'v':
195			verbose++;
196			break;
197		default:
198			usage();
199		}
200	}
201
202	/* log to stderr until daemonized */
203	log_init(debug ? debug : 1, LOG_DAEMON);
204
205	/* Open /dev/vmm */
206	vmm_fd = open(VMM_NODE, O_RDONLY);
207	if (vmm_fd == -1)
208		fatal("can't open vmm device node %s", VMM_NODE);
209
210	setproctitle("control");
211
212	SLIST_INIT(&vmstate);
213
214	signal(SIGTERM, sighdlr);
215	signal(SIGINT, sighdlr);
216	signal(SIGCHLD, sighdlr);
217
218	log_init(debug, LOG_DAEMON);
219	log_verbose(verbose);
220	log_procinit("control");
221
222	if (!debug && daemon(1, 0) == -1)
223		fatal("can't daemonize");
224
225	res = control_run();
226
227	if (res == -1)
228		fatalx("control socket error");
229
230	return (0);
231}
232
233/*
234 * control_run
235 *
236 * Main control loop - establishes listening socket for incoming vmmctl(8)
237 * requests and dispatches appropriate calls to vmm(4). Replies to
238 * vmmctl(8) using imsg.
239 *
240 * Return values:
241 *  0: normal exit (signal to quit received)
242 *  -1: abnormal exit (various causes)
243 */
244int
245control_run(void)
246{
247	struct sockaddr_un sun, c_sun;
248	socklen_t len;
249	int fd, connfd, n, res;
250	mode_t mode, old_umask;
251	char *socketpath;
252	struct imsgbuf *ibuf;
253	struct imsg imsg;
254
255	/* Establish and start listening on control socket */
256	socketpath = SOCKET_NAME;
257	if ((fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0)) == -1) {
258		log_warn("%s: socket error", __progname);
259		return (-1);
260	}
261
262	bzero(&sun, sizeof(sun));
263	sun.sun_family = AF_UNIX;
264	if (strlcpy(sun.sun_path, socketpath, sizeof(sun.sun_path)) >=
265	    sizeof(sun.sun_path)) {
266		log_warnx("%s: socket name too long", __progname);
267		close(fd);
268		return (-1);
269	}
270
271	if (unlink(socketpath) == -1)
272		if (errno != ENOENT) {
273			log_warn("%s: unlink of %s failed",
274			    __progname, socketpath);
275			close(fd);
276			return (-1);
277		}
278
279	old_umask = umask(S_IXUSR|S_IXGRP|S_IWOTH|S_IROTH|S_IXOTH);
280	mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP;
281
282	if (bind(fd, (struct sockaddr *)&sun, sizeof(sun)) == -1) {
283		log_warn("%s: control_init: bind of %s failed",
284		    __progname, socketpath);
285		close(fd);
286		umask(old_umask);
287		return (-1);
288	}
289
290	umask(old_umask);
291
292	if (chmod(socketpath, mode) == -1) {
293		log_warn("%s: control_init: chmod of %s failed",
294		    __progname, socketpath);
295		close(fd);
296		unlink(socketpath);
297		return (-1);
298	}
299
300	if ((ibuf = malloc(sizeof(struct imsgbuf))) == NULL) {
301		log_warn("%s: out of memory", __progname);
302		close(fd);
303		unlink(socketpath);
304		return (-1);
305	}
306
307	if (listen(fd, NR_BACKLOG) == -1) {
308		log_warn("%s: listen failed", __progname);
309		close(fd);
310		unlink(socketpath);
311		return (-1);
312	}
313
314	while (!quit) {
315		if ((connfd = accept4(fd, (struct sockaddr *)&c_sun, &len,
316		    SOCK_CLOEXEC)) == -1) {
317			log_warn("%s: accept4 error", __progname);
318			close(fd);
319			unlink(socketpath);
320			return (-1);
321		}
322
323		imsg_init(ibuf, connfd);
324		if ((n = imsg_read(ibuf)) == -1 || n == 0) {
325			log_warnx("%s: imsg_read error, n=%d",
326			    __progname, n);
327			continue;
328		}
329
330		for (;;) {
331			if ((n = imsg_get(ibuf, &imsg)) == -1)
332				return (-1);
333
334			if (n == 0)
335				break;
336
337			/* Process incoming message (from vmmctl(8)) */
338			switch (imsg.hdr.type) {
339			case IMSG_VMDOP_DISABLE_VMM_REQUEST:
340				res = disable_vmm();
341				imsg_compose(ibuf,
342				    IMSG_VMDOP_DISABLE_VMM_RESPONSE, 0, 0, -1,
343				    &res, sizeof(res));
344				break;
345			case IMSG_VMDOP_ENABLE_VMM_REQUEST:
346				res = enable_vmm();
347				imsg_compose(ibuf,
348				    IMSG_VMDOP_ENABLE_VMM_RESPONSE, 0, 0, -1,
349				    &res, sizeof(res));
350				break;
351			case IMSG_VMDOP_START_VM_REQUEST:
352				res = start_vm(&imsg);
353				imsg_compose(ibuf,
354				    IMSG_VMDOP_START_VM_RESPONSE, 0, 0, -1,
355				    &res, sizeof(res));
356				break;
357			case IMSG_VMDOP_TERMINATE_VM_REQUEST:
358				res = terminate_vm(&imsg);
359				imsg_compose(ibuf,
360				    IMSG_VMDOP_TERMINATE_VM_RESPONSE, 0, 0, -1,
361				    &res, sizeof(res));
362				break;
363			case IMSG_VMDOP_GET_INFO_VM_REQUEST:
364				res = get_info_vm(ibuf);
365				imsg_compose(ibuf,
366				    IMSG_VMDOP_GET_INFO_VM_END_DATA, 0, 0, -1,
367				    &res, sizeof(res));
368				break;
369			}
370
371			while (ibuf->w.queued)
372				if (msgbuf_write(&ibuf->w) <= 0 && errno !=
373				    EAGAIN) {
374					log_warn("%s: msgbuf_write error",
375					    __progname);
376					close(fd);
377					close(connfd);
378					unlink(socketpath);
379					return (-1);
380				}
381			imsg_free(&imsg);
382		}
383		close(connfd);
384	}
385
386	signal(SIGCHLD, SIG_IGN);
387
388	return (0);
389}
390
391/*
392 * disable_vmm
393 *
394 * Disables VMM mode on all CPUs
395 *
396 * Return values:
397 *  0: success
398 *  !0 : ioctl to vmm(4) failed
399 */
400int
401disable_vmm(void)
402{
403	if (ioctl(vmm_fd, VMM_IOC_STOP, NULL) < 0)
404		return (errno);
405
406	return (0);
407}
408
409/*
410 * enable_vmm
411 *
412 * Enables VMM mode on all CPUs
413 *
414 * Return values:
415 *  0: success
416 *  !0 : ioctl to vmm(4) failed
417 */
418int
419enable_vmm(void)
420{
421	if (ioctl(vmm_fd, VMM_IOC_START, NULL) < 0)
422		return (errno);
423
424	return (0);
425}
426
427/*
428 * terminate_vm
429 *
430 * Requests vmm(4) to terminate the VM whose ID is provided in the
431 * supplied vm_terminate_params structure (vtp->vtp_vm_id)
432 *
433 * Parameters
434 *  imsg: The incoming imsg body whose 'data' field contains the
435 *      vm_terminate_params struct
436 *
437 * Return values:
438 *  0: success
439 *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM is not
440 *      valid)
441 */
442int
443terminate_vm(struct imsg *imsg)
444{
445	struct vm_terminate_params *vtp;
446
447	vtp = (struct vm_terminate_params *)imsg->data;
448
449	if (ioctl(vmm_fd, VMM_IOC_TERM, vtp) < 0)
450                return (errno);
451
452	return (0);
453}
454
455/*
456 * opentap
457 *
458 * Opens the next available tap device, up to MAX_TAP.
459 *
460 * Returns a file descriptor to the tap node opened, or -1 if no tap
461 * devices were available.
462 */
463int
464opentap(void)
465{
466	int i, fd;
467	char path[PATH_MAX];
468
469	for (i = 0; i < MAX_TAP; i++) {
470		snprintf(path, PATH_MAX, "/dev/tap%d", i);
471		fd = open(path, O_RDWR | O_NONBLOCK);
472		if (fd != -1)
473			return (fd);
474	}
475
476	return (-1);
477}
478
479/*
480 * start_vm
481 *
482 * Starts a new VM with the creation parameters supplied (in the incoming
483 * imsg->data field). This function performs a basic sanity check on the
484 * incoming parameters and then performs the following steps to complete
485 * the creation of the VM:
486 *
487 * 1. opens the VM disk image files specified in the VM creation parameters
488 * 2. opens the specified VM kernel
489 * 3. creates a VM console tty pair using openpty
490 * 4. forks, passing the file descriptors opened in steps 1-3 to the child
491 *     vmd responsible for dropping privilege and running the VM's VCPU
492 *     loops.
493 *
494 * Parameters:
495 *  imsg: The incoming imsg body whose 'data' field is a vm_create_params
496 *      struct containing the VM creation parameters.
497 *
498 * Return values:
499 *  0: success
500 *  !0 : failure - typically an errno indicating the source of the failure
501 */
502int
503start_vm(struct imsg *imsg)
504{
505	struct vm_create_params *vcp;
506	size_t i;
507	off_t kernel_size;
508	struct stat sb;
509	int child_disks[VMM_MAX_DISKS_PER_VM], kernel_fd, ret, ttym_fd;
510	int child_taps[VMM_MAX_NICS_PER_VM];
511	int ttys_fd;
512	char ptyn[32];
513
514	vcp = (struct vm_create_params *)imsg->data;
515
516	for (i = 0 ; i < VMM_MAX_DISKS_PER_VM; i++)
517		child_disks[i] = -1;
518	for (i = 0 ; i < VMM_MAX_NICS_PER_VM; i++)
519		child_taps[i] = -1;
520
521	/*
522	 * XXX kernel_fd can't be global (possible race if multiple VMs
523	 * being created at the same time). Probably need to move this
524	 * into the child before dropping privs, or just make it local
525	 * to this function?
526	 */
527	kernel_fd = -1;
528
529	ttym_fd = -1;
530	ttys_fd = -1;
531
532	/* Open disk images for child */
533	for (i = 0 ; i < vcp->vcp_ndisks; i++) {
534		child_disks[i] = open(vcp->vcp_disks[i], O_RDWR);
535		if (child_disks[i] == -1) {
536			ret = errno;
537			log_warn("%s: can't open %s", __progname,
538			    vcp->vcp_disks[i]);
539			goto err;
540		}
541	}
542
543	bzero(&sb, sizeof(sb));
544	if (stat(vcp->vcp_kernel, &sb) == -1) {
545		ret = errno;
546		log_warn("%s: can't stat kernel image %s",
547		    __progname, vcp->vcp_kernel);
548		goto err;
549	}
550
551	kernel_size = sb.st_size;
552
553	/* Open kernel image */
554	kernel_fd = open(vcp->vcp_kernel, O_RDONLY);
555	if (kernel_fd == -1) {
556		ret = errno;
557		log_warn("%s: can't open kernel image %s",
558		    __progname, vcp->vcp_kernel);
559		goto err;
560	}
561
562	if (openpty(&ttym_fd, &ttys_fd, ptyn, NULL, NULL) == -1) {
563		ret = errno;
564		log_warn("%s: openpty failed", __progname);
565		goto err;
566	}
567
568	if (close(ttys_fd)) {
569		ret = errno;
570		log_warn("%s: close tty failed", __progname);
571		goto err;
572	}
573
574	/* Open tap devices for child */
575	for (i = 0 ; i < vcp->vcp_nnics; i++) {
576		child_taps[i] = opentap();
577		if (child_taps[i] == -1) {
578			ret = errno;
579			log_warn("%s: can't open tap for nic %zd",
580			    __progname, i);
581			goto err;
582		}
583	}
584
585	/* Start child vmd for this VM (fork, chroot, drop privs) */
586	ret = start_client_vmd();
587
588	/* Start child failed? - cleanup and leave */
589	if (ret == -1) {
590		ret = EIO;
591		goto err;
592	}
593
594	if (ret > 0) {
595		/* Parent */
596		for (i = 0 ; i < vcp->vcp_ndisks; i++)
597			close(child_disks[i]);
598
599		for (i = 0 ; i < vcp->vcp_nnics; i++)
600			close(child_taps[i]);
601
602		close(kernel_fd);
603		close(ttym_fd);
604
605		return (0);
606	}
607	else {
608		/* Child */
609		setproctitle(vcp->vcp_name);
610		log_procinit(vcp->vcp_name);
611
612		log_info("%s: vm console: %s", __progname, ptyn);
613		ret = vmm_create_vm(vcp);
614		if (ret) {
615			errno = ret;
616			fatal("create vmm ioctl failed - exiting");
617		}
618
619		/* Load kernel image */
620		ret = loadelf_main(kernel_fd, vcp->vcp_id, vcp->vcp_memory_size);
621		if (ret) {
622			errno = ret;
623			fatal("failed to load kernel - exiting");
624		}
625
626		close(kernel_fd);
627
628		con_fd = ttym_fd;
629		if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
630			fatal("failed to set nonblocking mode on console");
631
632		/* Execute the vcpu run loop(s) for this VM */
633		ret = run_vm(child_disks, child_taps, vcp);
634		_exit(ret != 0);
635	}
636
637	return (ret);
638
639err:
640	for (i = 0 ; i < vcp->vcp_ndisks; i++)
641		if (child_disks[i] != -1)
642			close(child_disks[i]);
643
644	for (i = 0 ; i < vcp->vcp_nnics; i++)
645		if (child_taps[i] != -1)
646			close(child_taps[i]);
647
648	if (kernel_fd != -1)
649		close(kernel_fd);
650
651	if (ttym_fd != -1)
652		close(ttym_fd);
653
654	return (ret);
655}
656
657/*
658 * get_info_vm
659 *
660 * Returns a list of VMs known to vmm(4).
661 *
662 * Parameters:
663 *  ibuf: the imsg ibuf in which to place the results. A new imsg will
664 *      be created using this ibuf.
665 *
666 * Return values:
667 *  0: success
668 *  !0 : failure (eg, ENOMEM, EIO or another error code from vmm(4) ioctl)
669 */
670int
671get_info_vm(struct imsgbuf *ibuf)
672{
673	int ret;
674	size_t ct, i;
675	struct ibuf *obuf;
676	struct vm_info_params vip;
677	struct vm_info_result *info;
678
679	/*
680	 * We issue the VMM_IOC_INFO ioctl twice, once with an input
681	 * buffer size of 0, which results in vmm(4) returning the
682	 * number of bytes required back to us in vip.vip_size,
683	 * and then we call it again after malloc'ing the required
684	 * number of bytes.
685	 *
686	 * It is possible that we could fail a second time (eg, if
687	 * another VM was created in the instant between the two
688	 * ioctls, but in that case the caller can just try again
689	 * as vmm(4) will return a zero-sized list in that case.
690	 */
691	vip.vip_size = 0;
692	info = NULL;
693	ret = 0;
694
695	/* First ioctl to see how many bytes needed (vip.vip_size) */
696	if (ioctl(vmm_fd, VMM_IOC_INFO, &vip) < 0)
697		return (errno);
698
699	if (vip.vip_info_ct != 0)
700		return (EIO);
701
702	info = malloc(vip.vip_size);
703	if (info == NULL)
704		return (ENOMEM);
705
706	/* Second ioctl to get the actual list */
707	vip.vip_info = info;
708	if (ioctl(vmm_fd, VMM_IOC_INFO, &vip) < 0) {
709		ret = errno;
710		free(info);
711		return (ret);
712	}
713
714	/* Return info to vmmctl(4) */
715	ct = vip.vip_size / sizeof(struct vm_info_result);
716	for (i = 0; i < ct; i++) {
717		obuf = imsg_create(ibuf, IMSG_VMDOP_GET_INFO_VM_DATA, 0, 0,
718		    sizeof(struct vm_info_result));
719		imsg_add(obuf, &info[i], sizeof(struct vm_info_result));
720		imsg_close(ibuf, obuf);
721	}
722	free(info);
723	return (0);
724}
725
726
727/*
728 * start_client_vmd
729 *
730 * forks a copy of the parent vmd, chroots to VMD_USER's home, drops
731 * privileges (changes to user VMD_USER), and returns.
732 * Should the fork operation succeed, but later chroot/privsep
733 * fail, the child exits.
734 *
735 * Return values (returns to both child and parent on success):
736 *  -1 : failure
737 *  0: return to child vmd returns 0
738 *  !0 : return to parent vmd returns the child's pid
739 */
740int
741start_client_vmd(void)
742{
743	int child_pid;
744	struct passwd *pw;
745
746	pw = getpwnam(VMD_USER);
747	if (pw == NULL) {
748		log_warnx("%s: no such user %s", __progname, VMD_USER);
749		return (-1);
750	}
751
752	child_pid = fork();
753	if (child_pid < 0)
754		return (-1);
755
756	if (!child_pid) {
757		/* Child */
758		if (chroot(pw->pw_dir) != 0)
759			fatal("unable to chroot");
760		if (chdir("/") != 0)
761			fatal("unable to chdir");
762
763		if (setgroups(1, &pw->pw_gid) == -1)
764			fatal("setgroups() failed");
765		if (setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) == -1)
766			fatal("setresgid() failed");
767		if (setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid) == -1)
768			fatal("setresuid() failed");
769
770		return (0);
771	}
772
773	/* Parent */
774	return (child_pid);
775}
776
777/*
778 * vmm_create_vm
779 *
780 * Requests vmm(4) to create a new VM using the supplied creation
781 * parameters. This operation results in the creation of the in-kernel
782 * structures for the VM, but does not start the VM's vcpu(s).
783 *
784 * Parameters:
785 *  vcp: vm_create_params struct containing the VM's desired creation
786 *      configuration
787 *
788 * Return values:
789 *  0: success
790 *  !0 : ioctl to vmm(4) failed
791 */
792int
793vmm_create_vm(struct vm_create_params *vcp)
794{
795	/* Sanity check arguments */
796	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
797		return (EINVAL);
798
799	if (vcp->vcp_memory_size > VMM_MAX_VM_MEM_SIZE)
800		return (EINVAL);
801
802	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
803		return (EINVAL);
804
805	if (ioctl(vmm_fd, VMM_IOC_CREATE, vcp) < 0)
806		return (errno);
807
808	return (0);
809}
810
811/*
812 * init_emulated_hw
813 *
814 * Initializes the userspace hardware emulation
815 */
816void
817init_emulated_hw(struct vm_create_params *vcp, int *child_disks,
818    int *child_taps)
819{
820	/* Init the i8253 PIT's 3 counters */
821	bzero(&i8253_counter, sizeof(struct i8253_counter) * 3);
822	gettimeofday(&i8253_counter[0].tv, NULL);
823	gettimeofday(&i8253_counter[1].tv, NULL);
824	gettimeofday(&i8253_counter[2].tv, NULL);
825	i8253_counter[0].start = TIMER_DIV(100);
826	i8253_counter[1].start = TIMER_DIV(100);
827	i8253_counter[2].start = TIMER_DIV(100);
828
829	/* Init ns8250 UART */
830	bzero(&com1_regs, sizeof(struct ns8250_regs));
831
832	/* Initialize PCI */
833	pci_init();
834
835	/* Initialize virtio devices */
836	virtio_init(vcp, child_disks, child_taps);
837}
838
839/*
840 * run_vm
841 *
842 * Runs the VM whose creation parameters are specified in vcp
843 *
844 * Parameters:
845 *  vcp: vm_create_params struct containing the VM's desired creation
846 *      configuration
847 *  child_disks: previously-opened child VM disk file file descriptors
848 *  child_taps: previously-opened child tap file descriptors
849 *
850 * Return values:
851 *  0: the VM exited normally
852 *  !0 : the VM exited abnormally or failed to start
853 */
854int
855run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp)
856{
857	size_t i;
858	int ret;
859	pthread_t *tid;
860	void *exit_status;
861	struct vm_run_params **vrp;
862
863	ret = 0;
864
865	/* XXX cap vcp_ncpus to avoid overflow here */
866	/*
867	 * XXX ensure nvcpus in vcp is same as vm, or fix vmm to return einval
868	 * on bad vcpu id
869	 */
870	tid = malloc(sizeof(pthread_t) * vcp->vcp_ncpus);
871	vrp = malloc(sizeof(struct vm_run_params *) * vcp->vcp_ncpus);
872	if (tid == NULL || vrp == NULL) {
873		log_warn("%s: memory allocation error - exiting.",
874		    __progname);
875		return (ENOMEM);
876	}
877
878	init_emulated_hw(vcp, child_disks, child_taps);
879
880	/*
881	 * Create and launch one thread for each VCPU. These threads may
882	 * migrate between PCPUs over time; the need to reload CPU state
883	 * in such situations is detected and performed by vmm(4) in the
884	 * kernel.
885	 */
886	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
887		vrp[i] = malloc(sizeof(struct vm_run_params));
888		if (vrp[i] == NULL) {
889			log_warn("%s: memory allocation error - "
890			    "exiting.", __progname);
891			/* caller will exit, so skip free'ing */
892			return (ENOMEM);
893		}
894		vrp[i]->vrp_exit = malloc(sizeof(union vm_exit));
895		if (vrp[i]->vrp_exit == NULL) {
896			log_warn("%s: memory allocation error - "
897			    "exiting.", __progname);
898			/* caller will exit, so skip free'ing */
899			return (ENOMEM);
900		}
901		vrp[i]->vrp_vm_id = vcp->vcp_id;
902		vrp[i]->vrp_vcpu_id = i;
903
904		/* Start each VCPU run thread at vcpu_run_loop */
905		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
906		if (ret) {
907			/* caller will _exit after this return */
908			return (ret);
909		}
910	}
911
912	/* Wait for all the threads to exit */
913	for (i = 0; i < vcp->vcp_ncpus; i++) {
914		if (pthread_join(tid[i], &exit_status)) {
915			log_warn("%s: failed to join thread %zd - "
916			    "exiting", __progname, i);
917			return (EIO);
918		}
919
920		if (exit_status != NULL) {
921			log_warnx("%s: vm %d vcpu run thread %zd exited "
922			    "abnormally", __progname, vcp->vcp_id, i);
923			ret = EIO;
924		}
925	}
926
927	return (ret);
928}
929
930/*
931 * vcpu_run_loop
932 *
933 * Runs a single VCPU until vmm(4) requires help handling an exit,
934 * or the VM terminates.
935 *
936 * Parameters:
937 *  arg: vcpu_run_params for the VCPU being run by this thread
938 *
939 * Return values:
940 *  NULL: the VCPU shutdown properly
941 *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
942 */
943void *
944vcpu_run_loop(void *arg)
945{
946	struct vm_run_params *vrp = (struct vm_run_params *)arg;
947	intptr_t ret;
948
949	vrp->vrp_continue = 0;
950	vrp->vrp_injint = -1;
951
952	for (;;) {
953		if (ioctl(vmm_fd, VMM_IOC_RUN, vrp) < 0) {
954			/* If run ioctl failed, exit */
955			ret = errno;
956			return ((void *)ret);
957		}
958
959		/* If the VM is terminating, exit normally */
960		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED)
961			return (NULL);
962
963		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
964			/*
965			 * vmm(4) needs help handling an exit, handle in
966			 * vcpu_exit.
967			 */
968			if (vcpu_exit(vrp))
969				return ((void *)EIO);
970		}
971	}
972
973	return (NULL);
974}
975
976/*
977 * vcpu_exit_i8253
978 *
979 * Handles emulated i8253 PIT access (in/out instruction to PIT ports).
980 * We don't emulate all the modes of the i8253, just the basic squarewave
981 * clock.
982 *
983 * Parameters:
984 *  vei: VM exit information from vmm(4) containing information on the in/out
985 *      instruction being performed
986 */
987void
988vcpu_exit_i8253(union vm_exit *vei)
989{
990	uint32_t out_data;
991	uint8_t sel, rw, data;
992	uint64_t ns, ticks;
993	struct timeval now, delta;
994
995	if (vei->vei.vei_port == TIMER_CTRL) {
996		if (vei->vei.vei_dir == 0) { /* OUT instruction */
997			out_data = vei->vei.vei_data;
998			sel = out_data &
999			    (TIMER_SEL0 | TIMER_SEL1 | TIMER_SEL2);
1000			sel = sel >> 6;
1001			if (sel > 2) {
1002				log_warnx("%s: i8253 PIT: invalid "
1003				    "timer selected (%d)",
1004				    __progname, sel);
1005				return;
1006			}
1007
1008			rw = vei->vei.vei_data &
1009			    (TIMER_LATCH | TIMER_LSB |
1010			    TIMER_MSB | TIMER_16BIT);
1011
1012			if (rw == TIMER_16BIT) {
1013				/*
1014				 * XXX this seems to be used on occasion, needs
1015				 * to be implemented
1016				 */
1017				log_warnx("%s: i8253 PIT: 16 bit "
1018				    "counter I/O not supported",
1019				    __progname);
1020				    return;
1021			}
1022
1023			/*
1024			 * Since we don't truly emulate each tick of the PIT
1025			 * clock, when the guest asks for the timer to be
1026			 * latched, simulate what the counter would have been
1027			 * had we performed full emulation. We do this by
1028			 * calculating when the counter was reset vs how much
1029			 * time has elapsed, then bias by the counter tick
1030			 * rate.
1031			 */
1032			if (rw == TIMER_LATCH) {
1033				gettimeofday(&now, NULL);
1034				delta.tv_sec = now.tv_sec -
1035				    i8253_counter[sel].tv.tv_sec;
1036				delta.tv_usec = now.tv_usec -
1037				    i8253_counter[sel].tv.tv_usec;
1038				if (delta.tv_usec < 0) {
1039					delta.tv_sec--;
1040					delta.tv_usec += 1000000;
1041				}
1042				if (delta.tv_usec > 1000000) {
1043					delta.tv_sec++;
1044					delta.tv_usec -= 1000000;
1045				}
1046				ns = delta.tv_usec * 1000 +
1047				    delta.tv_sec * 1000000000;
1048				ticks = ns / NS_PER_TICK;
1049				i8253_counter[sel].olatch =
1050				    i8253_counter[sel].start -
1051				    ticks % i8253_counter[sel].start;
1052				return;
1053			}
1054
1055			log_warnx("%s: i8253 PIT: unsupported rw mode "
1056			    "%d", __progname, rw);
1057			return;
1058		} else {
1059			/* XXX should this return 0xff? */
1060			log_warnx("%s: i8253 PIT: read from control "
1061			    "port unsupported", __progname);
1062		}
1063	} else {
1064		sel = vei->vei.vei_port - (TIMER_CNTR0 + TIMER_BASE);
1065		if (vei->vei.vei_dir == 0) { /* OUT instruction */
1066			if (i8253_counter[sel].last_w == 0) {
1067				out_data = vei->vei.vei_data;
1068				i8253_counter[sel].ilatch |= (out_data << 8);
1069				i8253_counter[sel].last_w = 1;
1070			} else {
1071				out_data = vei->vei.vei_data;
1072				i8253_counter[sel].ilatch |= out_data;
1073				i8253_counter[sel].start =
1074				    i8253_counter[sel].ilatch;
1075				i8253_counter[sel].last_w = 0;
1076			}
1077		} else {
1078			if (i8253_counter[sel].last_r == 0) {
1079				data = i8253_counter[sel].olatch >> 8;
1080				vei->vei.vei_data = data;
1081				i8253_counter[sel].last_w = 1;
1082			} else {
1083				data = i8253_counter[sel].olatch & 0xFF;
1084				vei->vei.vei_data = data;
1085				i8253_counter[sel].last_w = 0;
1086			}
1087		}
1088	}
1089}
1090
1091/*
1092 * vcpu_process_com_data
1093 *
1094 * Emulate in/out instructions to the com1 (ns8250) UART data register
1095 *
1096 * Parameters:
1097 *  vei: vm exit information from vmm(4) containing information on the in/out
1098 *      instruction being performed
1099 */
1100void
1101vcpu_process_com_data(union vm_exit *vei)
1102{
1103	/*
1104	 * vei_dir == 0 : out instruction
1105	 *
1106	 * The guest wrote to the data register. Since we are emulating a
1107	 * no-fifo chip, write the character immediately to the pty and
1108	 * assert TXRDY in IIR (if the guest has requested TXRDY interrupt
1109	 * reporting)
1110	 */
1111	if (vei->vei.vei_dir == 0) {
1112		write(con_fd, &vei->vei.vei_data, 1);
1113		if (com1_regs.ier & 0x2) {
1114			/* Set TXRDY */
1115			com1_regs.iir |= IIR_TXRDY;
1116			/* Set "interrupt pending" (IIR low bit cleared) */
1117			com1_regs.iir &= ~0x1;
1118		}
1119	} else {
1120		/*
1121		 * vei_dir == 1 : in instruction
1122		 *
1123		 * The guest read from the data register. Check to see if
1124		 * there is data available (RXRDY) and if so, consume the
1125		 * input data and return to the guest. Also clear the
1126		 * interrupt info register regardless.
1127		 */
1128		if (com1_regs.lsr & LSR_RXRDY) {
1129			vei->vei.vei_data = com1_regs.data;
1130			com1_regs.data = 0x0;
1131			com1_regs.lsr &= ~LSR_RXRDY;
1132		} else {
1133			/* XXX should this be com1_regs.data or 0xff? */
1134			vei->vei.vei_data = com1_regs.data;
1135			log_warnx("guest reading com1 when not ready");
1136		}
1137
1138		/* Reading the data register always clears RXRDY from IIR */
1139		com1_regs.iir &= ~IIR_RXRDY;
1140
1141		/*
1142		 * Clear "interrupt pending" by setting IIR low bit to 1
1143		 * if no interrupt are pending
1144		 */
1145		if (com1_regs.iir == 0x0)
1146			com1_regs.iir = 0x1;
1147	}
1148}
1149
1150/*
1151 * vcpu_process_com_lcr
1152 *
1153 * Emulate in/out instructions to the com1 (ns8250) UART line control register
1154 *
1155 * Paramters:
1156 *  vei: vm exit information from vmm(4) containing information on the in/out
1157 *      instruction being performed
1158 */
1159void
1160vcpu_process_com_lcr(union vm_exit *vei)
1161{
1162	/*
1163	 * vei_dir == 0 : out instruction
1164	 *
1165	 * Write content to line control register
1166	 */
1167	if (vei->vei.vei_dir == 0) {
1168		com1_regs.lcr = (uint8_t)vei->vei.vei_data;
1169	} else {
1170		/*
1171		 * vei_dir == 1 : in instruction
1172		 *
1173		 * Read line control register
1174		 */
1175		vei->vei.vei_data = com1_regs.lcr;
1176	}
1177}
1178
1179/*
1180 * vcpu_process_com_iir
1181 *
1182 * Emulate in/out instructions to the com1 (ns8250) UART interrupt information
1183 * register. Note that writes to this register actually are to a different
1184 * register, the FCR (FIFO control register) that we don't emulate but still
1185 * consume the data provided.
1186 *
1187 * Parameters:
1188 *  vei: vm exit information from vmm(4) containing information on the in/out
1189 *      instruction being performed
1190 */
1191void
1192vcpu_process_com_iir(union vm_exit *vei)
1193{
1194	/*
1195	 * vei_dir == 0 : out instruction
1196	 *
1197	 * Write to FCR
1198	 */
1199	if (vei->vei.vei_dir == 0) {
1200		com1_regs.fcr = vei->vei.vei_data;
1201	} else {
1202		/*
1203		 * vei_dir == 1 : in instruction
1204		 *
1205		 * Read IIR. Reading the IIR resets the TXRDY bit in the IIR
1206		 * after the data is read.
1207		 */
1208		vei->vei.vei_data = com1_regs.iir;
1209		com1_regs.iir &= ~IIR_TXRDY;
1210
1211		/*
1212		 * Clear "interrupt pending" by setting IIR low bit to 1
1213		 * if no interrupts are pending
1214		 */
1215		if (com1_regs.iir == 0x0)
1216			com1_regs.iir = 0x1;
1217	}
1218}
1219
1220/*
1221 * vcpu_process_com_mcr
1222 *
1223 * Emulate in/out instructions to the com1 (ns8250) UART modem control
1224 * register.
1225 *
1226 * Parameters:
1227 *  vei: vm exit information from vmm(4) containing information on the in/out
1228 *      instruction being performed
1229 */
1230void
1231vcpu_process_com_mcr(union vm_exit *vei)
1232{
1233	/*
1234	 * vei_dir == 0 : out instruction
1235	 *
1236	 * Write to MCR
1237	 */
1238	if (vei->vei.vei_dir == 0) {
1239		com1_regs.mcr = vei->vei.vei_data;
1240	} else {
1241		/*
1242		 * vei_dir == 1 : in instruction
1243		 *
1244		 * Read from MCR
1245		 */
1246		vei->vei.vei_data = com1_regs.mcr;
1247	}
1248}
1249
1250/*
1251 * vcpu_process_com_lsr
1252 *
1253 * Emulate in/out instructions to the com1 (ns8250) UART line status register.
1254 *
1255 * Parameters:
1256 *  vei: vm exit information from vmm(4) containing information on the in/out
1257 *      instruction being performed
1258 */
1259void
1260vcpu_process_com_lsr(union vm_exit *vei)
1261{
1262	/*
1263	 * vei_dir == 0 : out instruction
1264	 *
1265	 * Write to LSR. This is an illegal operation, so we just log it and
1266	 * continue.
1267	 */
1268	if (vei->vei.vei_dir == 0) {
1269		log_warnx("%s: LSR UART write 0x%x unsupported",
1270		    __progname, vei->vei.vei_data);
1271	} else {
1272		/*
1273		 * vei_dir == 1 : in instruction
1274		 *
1275		 * Read from LSR. We always report TXRDY and TSRE since we
1276		 * can process output characters immediately (at any time).
1277		 */
1278		vei->vei.vei_data = com1_regs.lsr | LSR_TSRE | LSR_TXRDY;
1279	}
1280}
1281
1282/*
1283 * vcpu_process_com_msr
1284 *
1285 * Emulate in/out instructions to the com1 (ns8250) UART modem status register.
1286 *
1287 * Parameters:
1288 *  vei: vm exit information from vmm(4) containing information on the in/out
1289 *      instruction being performed
1290 */
1291void
1292vcpu_process_com_msr(union vm_exit *vei)
1293{
1294	/*
1295	 * vei_dir == 0 : out instruction
1296	 *
1297	 * Write to MSR. This is an illegal operation, so we just log it and
1298	 * continue.
1299	 */
1300	if (vei->vei.vei_dir == 0) {
1301		log_warnx("%s: MSR UART write 0x%x unsupported",
1302		    __progname, vei->vei.vei_data);
1303	} else {
1304		/*
1305		 * vei_dir == 1 : in instruction
1306		 *
1307		 * Read from MSR. We always report DCD, DSR, and CTS.
1308		 */
1309		vei->vei.vei_data = com1_regs.lsr | MSR_DCD | MSR_DSR | MSR_CTS;
1310	}
1311}
1312
1313/*
1314 * vcpu_process_com_scr
1315 *
1316 * Emulate in/out instructions to the com1 (ns8250) UART scratch register. The
1317 * scratch register is sometimes used to distinguish an 8250 from a 16450,
1318 * and/or used to distinguish submodels of the 8250 (eg 8250A, 8250B). We
1319 * simulate an "original" 8250 by forcing the scratch register to return data
1320 * on read that is different from what was written.
1321 *
1322 * Parameters:
1323 *  vei: vm exit information from vmm(4) containing information on the in/out
1324 *      instruction being performed
1325 */
1326void
1327vcpu_process_com_scr(union vm_exit *vei)
1328{
1329	/*
1330	 * vei_dir == 0 : out instruction
1331	 *
1332	 * Write to SCR
1333	 */
1334	if (vei->vei.vei_dir == 0) {
1335		com1_regs.scr = vei->vei.vei_data;
1336	} else {
1337		/*
1338		 * vei_dir == 1 : in instruction
1339		 *
1340		 * Read from SCR. To make sure we don't accidentally simulate
1341		 * a real scratch register, we negate what was written on
1342		 * subsequent readback.
1343		 */
1344		vei->vei.vei_data = ~com1_regs.scr;
1345	}
1346}
1347
1348/*
1349 * vcpu_process_com_ier
1350 *
1351 * Emulate in/out instructions to the com1 (ns8250) UART interrupt enable
1352 * register.
1353 *
1354 * Parameters:
1355 *  vei: vm exit information from vmm(4) containing information on the in/out
1356 *      instruction being performed
1357 */
1358void
1359vcpu_process_com_ier(union vm_exit *vei)
1360{
1361	/*
1362	 * vei_dir == 0 : out instruction
1363	 *
1364	 * Write to IER
1365	 */
1366	if (vei->vei.vei_dir == 0) {
1367		com1_regs.ier = vei->vei.vei_data;
1368	} else {
1369		/*
1370		 * vei_dir == 1 : in instruction
1371		 *
1372		 * Read from IER
1373		 */
1374		vei->vei.vei_data = com1_regs.ier;
1375	}
1376}
1377
1378/*
1379 * vcpu_exit_com
1380 *
1381 * Process com1 (ns8250) UART exits. vmd handles most basic 8250
1382 * features with the exception of the divisor latch (eg, no baud
1383 * rate support)
1384 *
1385 * Parameters:
1386 *  vrp: vcpu run parameters containing guest state for this exit
1387 */
1388void
1389vcpu_exit_com(struct vm_run_params *vrp)
1390{
1391	union vm_exit *vei = vrp->vrp_exit;
1392
1393	switch(vei->vei.vei_port) {
1394	case COM1_LCR:
1395		vcpu_process_com_lcr(vei);
1396		break;
1397	case COM1_IER:
1398		vcpu_process_com_ier(vei);
1399		break;
1400	case COM1_IIR:
1401		vcpu_process_com_iir(vei);
1402		break;
1403	case COM1_MCR:
1404		vcpu_process_com_mcr(vei);
1405		break;
1406	case COM1_LSR:
1407		vcpu_process_com_lsr(vei);
1408		break;
1409	case COM1_MSR:
1410		vcpu_process_com_msr(vei);
1411		break;
1412	case COM1_SCR:
1413		vcpu_process_com_scr(vei);
1414		break;
1415	case COM1_DATA:
1416		vcpu_process_com_data(vei);
1417		break;
1418	}
1419}
1420
1421/*
1422 * vcpu_exit_pci
1423 *
1424 * Handle all I/O to the emulated PCI subsystem.
1425 *
1426 * Parameters:
1427 *  vrp: vcpu run paramters containing guest state for this exit
1428 *
1429 * Return values:
1430 *  0xff if no interrupt is required after this pci exit,
1431 *      or an interrupt vector otherwise
1432 */
1433uint8_t
1434vcpu_exit_pci(struct vm_run_params *vrp)
1435{
1436	union vm_exit *vei = vrp->vrp_exit;
1437	uint8_t intr;
1438
1439	intr = 0xFF;
1440
1441	switch(vei->vei.vei_port) {
1442	case PCI_MODE1_ADDRESS_REG:
1443		pci_handle_address_reg(vrp);
1444		break;
1445	case PCI_MODE1_DATA_REG:
1446		pci_handle_data_reg(vrp);
1447		break;
1448	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
1449		intr = pci_handle_io(vrp);
1450		break;
1451	default:
1452		log_warnx("%s: unknown PCI register 0x%llx",
1453		    __progname, (uint64_t)vei->vei.vei_port);
1454		break;
1455	}
1456
1457	return (intr);
1458}
1459
1460/*
1461 * vcpu_exit_inout
1462 *
1463 * Handle all I/O exits that need to be emulated in vmd. This includes the
1464 * i8253 PIT and the com1 ns8250 UART.
1465 *
1466 * Parameters:
1467 *  vrp: vcpu run parameters containing guest state for this exit
1468 */
1469void
1470vcpu_exit_inout(struct vm_run_params *vrp)
1471{
1472	union vm_exit *vei = vrp->vrp_exit;
1473	uint8_t intr;
1474
1475	switch(vei->vei.vei_port) {
1476	case TIMER_CTRL:
1477	case (TIMER_CNTR0 + TIMER_BASE):
1478	case (TIMER_CNTR1 + TIMER_BASE):
1479	case (TIMER_CNTR2 + TIMER_BASE):
1480		vcpu_exit_i8253(vei);
1481		break;
1482	case COM1_DATA ... COM1_SCR:
1483		vcpu_exit_com(vrp);
1484		break;
1485	case PCI_MODE1_ADDRESS_REG:
1486	case PCI_MODE1_DATA_REG:
1487	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
1488		intr = vcpu_exit_pci(vrp);
1489		if (intr != 0xFF)
1490			vrp->vrp_injint = intr;
1491		else
1492			vrp->vrp_injint = -1;
1493		break;
1494	default:
1495		/* IN from unsupported port gives FFs */
1496		if (vei->vei.vei_dir == 1)
1497			vei->vei.vei_data = 0xFFFFFFFF;
1498		break;
1499	}
1500}
1501
1502/*
1503 * vcpu_exit
1504 *
1505 * Handle a vcpu exit. This function is called when it is determined that
1506 * vmm(4) requires the assistance of vmd to support a particular guest
1507 * exit type (eg, accessing an I/O port or device). Guest state is contained
1508 * in 'vrp', and will be resent to vmm(4) on exit completion.
1509 *
1510 * Upon conclusion of handling the exit, the function determines if any
1511 * interrupts should be injected into the guest, and sets vrp->vrp_injint
1512 * to the IRQ line whose interrupt should be vectored (or -1 if no interrupt
1513 * is to be injected).
1514 *
1515 * Parameters:
1516 *  vrp: vcpu run parameters containing guest state for this exit
1517 *
1518 * Return values:
1519 *  0: the exit was handled successfully
1520 *  1: an error occurred (exit not handled)
1521 */
1522int
1523vcpu_exit(struct vm_run_params *vrp)
1524{
1525	ssize_t sz;
1526	char ch;
1527
1528	switch (vrp->vrp_exit_reason) {
1529	case VMX_EXIT_IO:
1530		vcpu_exit_inout(vrp);
1531		break;
1532	case VMX_EXIT_HLT:
1533		/*
1534		 * XXX handle halted state, no reason to run this vcpu again
1535		 * until a vm interrupt is to be injected
1536		 */
1537		break;
1538	default:
1539		log_warnx("%s: unknown exit reason %d",
1540		    __progname, vrp->vrp_exit_reason);
1541		return (1);
1542	}
1543
1544	/* XXX interrupt priority */
1545	if (vionet_process_rx())
1546		vrp->vrp_injint = 9;
1547
1548	/*
1549	 * Is there a new character available on com1?
1550	 * If so, consume the character, buffer it into the com1 data register
1551	 * assert IRQ4, and set the line status register RXRDY bit.
1552	 *
1553	 * XXX - move all this com intr checking to another function
1554	 */
1555	sz = read(con_fd, &ch, sizeof(char));
1556	if (sz == 1) {
1557		com1_regs.lsr |= LSR_RXRDY;
1558		com1_regs.data = ch;
1559		/* XXX these ier and iir bits should be IER_x and IIR_x */
1560		if (com1_regs.ier & 0x1) {
1561			com1_regs.iir |= (2 << 1);
1562			com1_regs.iir &= ~0x1;
1563		}
1564	}
1565
1566	/*
1567	 * Clear "interrupt pending" by setting IIR low bit to 1 if no
1568	 * interrupts are pending
1569	 */
1570	/* XXX these iir magic numbers should be IIR_x */
1571	if ((com1_regs.iir & ~0x1) == 0x0)
1572		com1_regs.iir = 0x1;
1573
1574	/* If pending interrupt and nothing waiting to be injected, inject */
1575	if ((com1_regs.iir & 0x1) == 0)
1576		if (vrp->vrp_injint == -1)
1577			vrp->vrp_injint = 0x4;
1578	vrp->vrp_continue = 1;
1579
1580	return (0);
1581}
1582
1583/*
1584 * write_page
1585 *
1586 * Pushes a page of data from 'buf' into the guest VM's memory
1587 * at paddr 'dst'.
1588 *
1589 * Parameters:
1590 *  dst: the destination paddr_t in the guest VM to push into.
1591 *      If there is no guest paddr mapping at 'dst', a new page will be
1592 *      faulted in by the VMM (provided 'dst' represents a valid paddr
1593 *      in the guest's address space)
1594 *  buf: page of data to push
1595 *  len: size of 'buf'
1596 *  do_mask: 1 to mask the destination address (for kernel load), 0 to
1597 *      leave 'dst' unmasked
1598 *
1599 * Return values:
1600 *  various return values from ioctl(VMM_IOC_WRITEPAGE), or 0 if no error
1601 *      occurred.
1602 *
1603 * Note - this function only handles GPAs < 4GB.
1604 */
1605int
1606write_page(uint32_t dst, void *buf, uint32_t len, int do_mask)
1607{
1608	struct vm_writepage_params vwp;
1609
1610	/*
1611	 * Mask kernel load addresses to avoid uint32_t -> uint64_t cast
1612	 * errors
1613	 */
1614	if (do_mask)
1615		dst &= 0xFFFFFFF;
1616
1617	vwp.vwp_paddr = (paddr_t)dst;
1618	vwp.vwp_data = buf;
1619	vwp.vwp_vm_id = vm_id;
1620	vwp.vwp_len = len;
1621	if (ioctl(vmm_fd, VMM_IOC_WRITEPAGE, &vwp) < 0) {
1622		log_warn("writepage ioctl failed");
1623		return (errno);
1624	}
1625	return (0);
1626}
1627
1628/*
1629 * read_page
1630 *
1631 * Reads a page of memory at guest paddr 'src' into 'buf'.
1632 *
1633 * Parameters:
1634 *  src: the source paddr_t in the guest VM to read from.
1635 *  buf: destination (local) buffer
1636 *  len: size of 'buf'
1637 *  do_mask: 1 to mask the source address (for kernel load), 0 to
1638 *      leave 'src' unmasked
1639 *
1640 * Return values:
1641 *  various return values from ioctl(VMM_IOC_READPAGE), or 0 if no error
1642 *      occurred.
1643 *
1644 * Note - this function only handles GPAs < 4GB.
1645 */
1646int
1647read_page(uint32_t src, void *buf, uint32_t len, int do_mask)
1648{
1649	struct vm_readpage_params vrp;
1650
1651	/*
1652	 * Mask kernel load addresses to avoid uint32_t -> uint64_t cast
1653	 * errors
1654	 */
1655	if (do_mask)
1656		src &= 0xFFFFFFF;
1657
1658	vrp.vrp_paddr = (paddr_t)src;
1659	vrp.vrp_data = buf;
1660	vrp.vrp_vm_id = vm_id;
1661	vrp.vrp_len = len;
1662	if (ioctl(vmm_fd, VMM_IOC_READPAGE, &vrp) < 0) {
1663		log_warn("readpage ioctl failed");
1664		return (errno);
1665	}
1666	return (0);
1667}
1668