vmd.c revision 1.59
1/*	$OpenBSD: vmd.c,v 1.59 2017/04/25 16:38:23 reyk Exp $	*/
2
3/*
4 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/param.h>	/* nitems */
20#include <sys/queue.h>
21#include <sys/wait.h>
22#include <sys/cdefs.h>
23#include <sys/stat.h>
24#include <sys/tty.h>
25#include <sys/ioctl.h>
26
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <termios.h>
31#include <errno.h>
32#include <event.h>
33#include <fcntl.h>
34#include <pwd.h>
35#include <signal.h>
36#include <syslog.h>
37#include <unistd.h>
38#include <ctype.h>
39#include <pwd.h>
40#include <grp.h>
41
42#include "proc.h"
43#include "vmd.h"
44
/* Prints the command line synopsis to stderr and exits. */
__dead void usage(void);

/*
 * Prototypes for functions of the parent process.
 * NOTE(review): no definition of vmd_control_run() is visible in this
 * file -- confirm whether the prototype is still needed.
 */
int	 main(int, char **);
int	 vmd_configure(void);
void	 vmd_sighdlr(int sig, short event, void *arg);
void	 vmd_shutdown(void);
int	 vmd_control_run(void);
int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);

/* Global daemon state; allocated in main() and released in vmd_shutdown(). */
struct vmd	*env;

/*
 * Table of child processes that main() hands to proc_getid() and
 * proc_init().
 * NOTE(review): the initializer fields presumably are title, process id,
 * imsg dispatch callback, run function and optional shutdown function --
 * confirm against struct privsep_proc in proc.h.
 */
static struct privsep_proc procs[] = {
	/* Keep "priv" on top as procs[0] */
	{ "priv",	PROC_PRIV,	NULL, priv },
	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm, vmm_shutdown },
};

/* For the privileged process */
static struct privsep_proc *proc_priv = &procs[0];
/* Zero-initialized passwd entry that main() installs as proc_priv->p_pw. */
static struct passwd proc_privpw;
67
68int
69vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
70{
71	struct privsep			*ps = p->p_ps;
72	int				 res = 0, ret = 0, cmd = 0, verbose;
73	unsigned int			 v = 0;
74	struct vmop_create_params	 vmc;
75	struct vmop_id			 vid;
76	struct vm_terminate_params	 vtp;
77	struct vmop_result		 vmr;
78	struct vmd_vm			*vm = NULL;
79	char				*str = NULL;
80	uint32_t			 id = 0;
81
82	switch (imsg->hdr.type) {
83	case IMSG_VMDOP_START_VM_REQUEST:
84		IMSG_SIZE_CHECK(imsg, &vmc);
85		memcpy(&vmc, imsg->data, sizeof(vmc));
86		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_uid);
87		if (vmc.vmc_flags == 0) {
88			/* start an existing VM with pre-configured options */
89			if (!(ret == -1 && errno == EALREADY &&
90			    vm->vm_running == 0)) {
91				res = errno;
92				cmd = IMSG_VMDOP_START_VM_RESPONSE;
93			}
94		} else if (ret != 0) {
95			res = errno;
96			cmd = IMSG_VMDOP_START_VM_RESPONSE;
97		}
98		if (res == 0 &&
99		    config_setvm(ps, vm, imsg->hdr.peerid, vmc.vmc_uid) == -1) {
100			res = errno;
101			cmd = IMSG_VMDOP_START_VM_RESPONSE;
102		}
103		break;
104	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
105		IMSG_SIZE_CHECK(imsg, &vid);
106		memcpy(&vid, imsg->data, sizeof(vid));
107		if ((id = vid.vid_id) == 0) {
108			/* Lookup vm (id) by name */
109			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
110				res = ENOENT;
111				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
112				break;
113			} else if (vm->vm_shutdown) {
114				res = EALREADY;
115				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
116				break;
117			}
118			id = vm->vm_vmid;
119		} else
120			vm = vm_getbyvmid(id);
121		if (vm_checkperm(vm, vid.vid_uid) != 0) {
122			res = EPERM;
123			cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
124			break;
125		}
126		memset(&vtp, 0, sizeof(vtp));
127		vtp.vtp_vm_id = id;
128		if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
129		    imsg->hdr.peerid, -1, &vtp, sizeof(vtp)) == -1)
130			return (-1);
131		break;
132	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
133		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
134		break;
135	case IMSG_VMDOP_LOAD:
136		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
137		str = get_string((uint8_t *)imsg->data,
138		    IMSG_DATA_SIZE(imsg));
139	case IMSG_VMDOP_RELOAD:
140		vmd_reload(0, str);
141		free(str);
142		break;
143	case IMSG_CTL_RESET:
144		IMSG_SIZE_CHECK(imsg, &v);
145		memcpy(&v, imsg->data, sizeof(v));
146		vmd_reload(v, str);
147		break;
148	case IMSG_CTL_VERBOSE:
149		IMSG_SIZE_CHECK(imsg, &verbose);
150		memcpy(&verbose, imsg->data, sizeof(verbose));
151		log_setverbose(verbose);
152
153		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
154		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
155		break;
156	default:
157		return (-1);
158	}
159
160	switch (cmd) {
161	case 0:
162		break;
163	case IMSG_VMDOP_START_VM_RESPONSE:
164	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
165		memset(&vmr, 0, sizeof(vmr));
166		vmr.vmr_result = res;
167		vmr.vmr_id = id;
168		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
169		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
170			return (-1);
171		break;
172	default:
173		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
174		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
175			return (-1);
176		break;
177	}
178
179	return (0);
180}
181
182int
183vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
184{
185	struct vmop_result	 vmr;
186	struct privsep		*ps = p->p_ps;
187	int			 res = 0;
188	struct vmd_vm		*vm;
189	struct vm_create_params	*vcp;
190	struct vmop_info_result	 vir;
191
192	switch (imsg->hdr.type) {
193	case IMSG_VMDOP_START_VM_RESPONSE:
194		IMSG_SIZE_CHECK(imsg, &vmr);
195		memcpy(&vmr, imsg->data, sizeof(vmr));
196		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
197			break;
198		vm->vm_pid = vmr.vmr_pid;
199		vcp = &vm->vm_params.vmc_params;
200		vcp->vcp_id = vmr.vmr_id;
201
202		/*
203		 * If the peerid is not -1, forward the response back to the
204		 * the control socket.  If it is -1, the request originated
205		 * from the parent, not the control socket.
206		 */
207		if (vm->vm_peerid != (uint32_t)-1) {
208			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
209			    sizeof(vmr.vmr_ttyname));
210			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
211			    imsg->hdr.type, vm->vm_peerid, -1,
212			    &vmr, sizeof(vmr)) == -1) {
213				errno = vmr.vmr_result;
214				log_warn("%s: failed to foward vm result",
215				    vcp->vcp_name);
216				vm_remove(vm);
217				return (-1);
218			}
219		}
220
221		if (vmr.vmr_result) {
222			errno = vmr.vmr_result;
223			log_warn("%s: failed to start vm", vcp->vcp_name);
224			vm_remove(vm);
225			break;
226		}
227
228		/* Now configure all the interfaces */
229		if (vm_priv_ifconfig(ps, vm) == -1) {
230			log_warn("%s: failed to configure vm", vcp->vcp_name);
231			vm_remove(vm);
232			break;
233		}
234
235		log_info("%s: started vm %d successfully, tty %s",
236		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
237		break;
238	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
239		IMSG_SIZE_CHECK(imsg, &vmr);
240		memcpy(&vmr, imsg->data, sizeof(vmr));
241		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
242		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
243			break;
244		if (vmr.vmr_result == 0) {
245			/* Mark VM as shutting down */
246			vm->vm_shutdown = 1;
247		}
248		break;
249	case IMSG_VMDOP_TERMINATE_VM_EVENT:
250		IMSG_SIZE_CHECK(imsg, &vmr);
251		memcpy(&vmr, imsg->data, sizeof(vmr));
252		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
253			break;
254		if (vmr.vmr_result == 0) {
255			if (vm->vm_from_config)
256				vm_stop(vm, 0);
257			else
258				vm_remove(vm);
259		} else if (vmr.vmr_result == EAGAIN) {
260			/* Stop VM instance but keep the tty open */
261			vm_stop(vm, 1);
262			config_setvm(ps, vm, (uint32_t)-1, 0);
263		}
264		break;
265	case IMSG_VMDOP_GET_INFO_VM_DATA:
266		IMSG_SIZE_CHECK(imsg, &vir);
267		memcpy(&vir, imsg->data, sizeof(vir));
268		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
269			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
270			if (vm->vm_ttyname != NULL)
271				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
272				    sizeof(vir.vir_ttyname));
273			if (vm->vm_shutdown) {
274				/* XXX there might be a nicer way */
275				(void)strlcat(vir.vir_info.vir_name,
276				    " - stopping",
277				    sizeof(vir.vir_info.vir_name));
278			}
279			/* get the user id who started the vm */
280			vir.vir_uid = vm->vm_uid;
281			vir.vir_gid = vm->vm_params.vmc_gid;
282		}
283		if (proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type,
284		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
285			vm_remove(vm);
286			return (-1);
287		}
288		break;
289	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
290		/*
291		 * PROC_VMM has responded with the *running* VMs, now we
292		 * append the others. These use the special value 0 for their
293		 * kernel id to indicate that they are not running.
294		 */
295		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
296			if (!vm->vm_running) {
297				memset(&vir, 0, sizeof(vir));
298				vir.vir_info.vir_id = vm->vm_vmid;
299				strlcpy(vir.vir_info.vir_name,
300				    vm->vm_params.vmc_params.vcp_name,
301				    VMM_MAX_NAME_LEN);
302				vir.vir_info.vir_memory_size =
303				    vm->vm_params.vmc_params.vcp_memranges[0].vmr_size;
304				vir.vir_info.vir_ncpus =
305				    vm->vm_params.vmc_params.vcp_ncpus;
306				/* get the configured user id for this vm */
307				vir.vir_uid = vm->vm_params.vmc_uid;
308				vir.vir_gid = vm->vm_params.vmc_gid;
309				if (proc_compose_imsg(ps, PROC_CONTROL, -1,
310				    IMSG_VMDOP_GET_INFO_VM_DATA,
311				    imsg->hdr.peerid, -1, &vir,
312				    sizeof(vir)) == -1) {
313					vm_remove(vm);
314					return (-1);
315				}
316			}
317		}
318		IMSG_SIZE_CHECK(imsg, &res);
319		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
320		break;
321	default:
322		return (-1);
323	}
324
325	return (0);
326}
327
328void
329vmd_sighdlr(int sig, short event, void *arg)
330{
331	if (privsep_process != PROC_PARENT)
332		return;
333
334	switch (sig) {
335	case SIGHUP:
336		log_info("%s: reload requested with SIGHUP", __func__);
337
338		/*
339		 * This is safe because libevent uses async signal handlers
340		 * that run in the event loop and not in signal context.
341		 */
342		vmd_reload(0, NULL);
343		break;
344	case SIGPIPE:
345		log_info("%s: ignoring SIGPIPE", __func__);
346		break;
347	case SIGUSR1:
348		log_info("%s: ignoring SIGUSR1", __func__);
349		break;
350	case SIGTERM:
351	case SIGINT:
352		vmd_shutdown();
353		break;
354	default:
355		fatalx("unexpected signal");
356	}
357}
358
359__dead void
360usage(void)
361{
362	extern char *__progname;
363	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
364	    __progname);
365	exit(1);
366}
367
/*
 * Entry point: parse the command line, set up the privsep state, start
 * the child processes via proc_init(), install the signal handlers, load
 * the configuration and run the event loop.
 *
 * Returns 0 once event_dispatch() returns.
 */
int
main(int argc, char **argv)
{
	struct privsep		*ps;
	int			 ch;
	const char		*conffile = VMD_CONF;
	enum privsep_procid	 proc_id = PROC_PARENT;
	int			 proc_instance = 0;
	const char		*errp, *title = NULL;
	/* argc is modified below; proc_init() gets the original count */
	int			 argc0 = argc;

	/* log to stderr until daemonized */
	log_init(1, LOG_DAEMON);

	if ((env = calloc(1, sizeof(*env))) == NULL)
		fatal("calloc: env");

	/*
	 * -P and -I are accepted but not listed in usage().
	 * NOTE(review): presumably they are only passed by proc_init() when
	 * it re-executes the binary for a child process -- confirm in proc.c.
	 */
	while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) {
		switch (ch) {
		case 'D':
			/* macro definition; a parse failure is not fatal */
			if (cmdline_symset(optarg) < 0)
				log_warnx("could not parse macro definition %s",
				    optarg);
			break;
		case 'd':
			env->vmd_debug = 2;
			break;
		case 'f':
			conffile = optarg;
			break;
		case 'v':
			env->vmd_verbose++;
			break;
		case 'n':
			env->vmd_noaction = 1;
			break;
		case 'P':
			/* select the process to run by its title in procs[] */
			title = optarg;
			proc_id = proc_getid(procs, nitems(procs), title);
			if (proc_id == PROC_MAX)
				fatalx("invalid process name");
			break;
		case 'I':
			proc_instance = strtonum(optarg, 0,
			    PROC_MAX_INSTANCES, &errp);
			if (errp)
				fatalx("invalid process instance");
			break;
		default:
			usage();
		}
	}

	/* No positional arguments are accepted */
	argc -= optind;
	if (argc > 0)
		usage();

	/* -n without -d still gets a non-zero debug level */
	if (env->vmd_noaction && !env->vmd_debug)
		env->vmd_debug = 1;

	/* check for root privileges */
	if (env->vmd_noaction == 0) {
		if (geteuid())
			fatalx("need root privileges");
	}

	ps = &env->vmd_ps;
	ps->ps_env = env;
	env->vmd_fd = -1;

	if (config_init(env) == -1)
		fatal("failed to initialize configuration");

	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
		fatal("unknown user %s", VMD_USER);

	/* First proc runs as root without pledge but in default chroot */
	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */

	/* Open /dev/vmm */
	if (env->vmd_noaction == 0) {
		env->vmd_fd = open(VMM_NODE, O_RDWR);
		if (env->vmd_fd == -1)
			fatal("%s", VMM_NODE);
	}

	/* Configure the control socket */
	ps->ps_csock.cs_name = SOCKET_NAME;
	TAILQ_INIT(&ps->ps_rcsocks);

	/* Configuration will be parsed after forking the children */
	env->vmd_conffile = conffile;

	/* Switch logging to the debug level and verbosity requested above */
	log_init(env->vmd_debug, LOG_DAEMON);
	log_setverbose(env->vmd_verbose);

	if (env->vmd_noaction)
		ps->ps_noaction = 1;
	ps->ps_instance = proc_instance;
	if (title != NULL)
		ps->ps_title[proc_id] = title;

	/* only the parent returns */
	proc_init(ps, procs, nitems(procs), argc0, argv, proc_id);

	log_procinit("parent");
	/* Stay in the foreground whenever a debug level is set */
	if (!env->vmd_debug && daemon(0, 0) == -1)
		fatal("can't daemonize");

	if (ps->ps_noaction == 0)
		log_info("startup");

	event_init();

	/* All signals of interest are routed to vmd_sighdlr() */
	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);

	signal_add(&ps->ps_evsigint, NULL);
	signal_add(&ps->ps_evsigterm, NULL);
	signal_add(&ps->ps_evsighup, NULL);
	signal_add(&ps->ps_evsigpipe, NULL);
	signal_add(&ps->ps_evsigusr1, NULL);

	if (!env->vmd_noaction)
		proc_connect(ps);

	/* With -n, vmd_configure() exits after checking the configuration */
	if (vmd_configure() == -1)
		fatalx("configuration failed");

	event_dispatch();

	log_debug("parent exiting");

	return (0);
}
507
508int
509vmd_configure(void)
510{
511	struct vmd_vm		*vm;
512	struct vmd_switch	*vsw;
513
514	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
515		fatal("open %s", PATH_PTMDEV);
516
517	/*
518	 * pledge in the parent process:
519	 * stdio - for malloc and basic I/O including events.
520	 * rpath - for reload to open and read the configuration files.
521	 * wpath - for opening disk images and tap devices.
522	 * tty - for openpty.
523	 * proc - run kill to terminate its children safely.
524	 * sendfd - for disks, interfaces and other fds.
525	 * getpw - lookup user or group id by name.
526	 * chown, fattr - change tty ownership
527	 */
528	if (pledge("stdio rpath wpath proc tty sendfd getpw"
529	    " chown fattr", NULL) == -1)
530		fatal("pledge");
531
532	if (parse_config(env->vmd_conffile) == -1) {
533		proc_kill(&env->vmd_ps);
534		exit(1);
535	}
536
537	if (env->vmd_noaction) {
538		fprintf(stderr, "configuration OK\n");
539		proc_kill(&env->vmd_ps);
540		exit(0);
541	}
542
543	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
544		if (vsw->sw_running)
545			continue;
546		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
547			log_warn("%s: failed to create switch %s",
548			    __func__, vsw->sw_name);
549			switch_remove(vsw);
550			return (-1);
551		}
552	}
553
554	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
555		if (vm->vm_disabled) {
556			log_debug("%s: not creating vm %s (disabled)",
557			    __func__,
558			    vm->vm_params.vmc_params.vcp_name);
559			continue;
560		}
561		if (config_setvm(&env->vmd_ps, vm, -1, 0) == -1)
562			return (-1);
563	}
564
565	/* Send shared global configuration to all children */
566	if (config_setconfig(env) == -1)
567		return (-1);
568
569	return (0);
570}
571
572void
573vmd_reload(unsigned int reset, const char *filename)
574{
575	struct vmd_vm		*vm, *next_vm;
576	struct vmd_switch	*vsw;
577	int			 reload = 0;
578
579	/* Switch back to the default config file */
580	if (filename == NULL || *filename == '\0') {
581		filename = env->vmd_conffile;
582		reload = 1;
583	}
584
585	log_debug("%s: level %d config file %s", __func__, reset, filename);
586
587	if (reset) {
588		/* Purge the configuration */
589		config_purge(env, reset);
590		config_setreset(env, reset);
591	} else {
592		/*
593		 * Load or reload the configuration.
594		 *
595		 * Reloading removes all non-running VMs before processing the
596		 * config file, whereas loading only adds to the existing list
597		 * of VMs.
598		 */
599
600		if (reload) {
601			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, next_vm) {
602				if (vm->vm_running == 0)
603					vm_remove(vm);
604			}
605
606			/* Update shared global configuration in all children */
607			if (config_setconfig(env) == -1)
608				return;
609		}
610
611		if (parse_config(filename) == -1) {
612			log_debug("%s: failed to load config file %s",
613			    __func__, filename);
614		}
615
616		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
617			if (vsw->sw_running)
618				continue;
619			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
620				log_warn("%s: failed to create switch %s",
621				    __func__, vsw->sw_name);
622				switch_remove(vsw);
623				return;
624			}
625		}
626
627		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
628			if (vm->vm_running == 0) {
629				if (vm->vm_disabled) {
630					log_debug("%s: not creating vm %s"
631					    " (disabled)", __func__,
632					    vm->vm_params.vmc_params.vcp_name);
633					continue;
634				}
635				if (config_setvm(&env->vmd_ps, vm, -1, 0) == -1)
636					return;
637			} else {
638				log_debug("%s: not creating vm \"%s\": "
639				    "(running)", __func__,
640				    vm->vm_params.vmc_params.vcp_name);
641			}
642		}
643	}
644}
645
646void
647vmd_shutdown(void)
648{
649	struct vmd_vm *vm, *vm_next;
650
651	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
652		vm_remove(vm);
653	}
654
655	proc_kill(&env->vmd_ps);
656	free(env);
657
658	log_warnx("parent terminating");
659	exit(0);
660}
661
662struct vmd_vm *
663vm_getbyvmid(uint32_t vmid)
664{
665	struct vmd_vm	*vm;
666
667	if (vmid == 0)
668		return (NULL);
669	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
670		if (vm->vm_vmid == vmid)
671			return (vm);
672	}
673
674	return (NULL);
675}
676
677struct vmd_vm *
678vm_getbyid(uint32_t id)
679{
680	struct vmd_vm	*vm;
681
682	if (id == 0)
683		return (NULL);
684	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
685		if (vm->vm_params.vmc_params.vcp_id == id)
686			return (vm);
687	}
688
689	return (NULL);
690}
691
692uint32_t
693vm_id2vmid(uint32_t id, struct vmd_vm *vm)
694{
695	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
696		return (0);
697	dprintf("%s: vmm id %u is vmid %u", __func__,
698	    id, vm->vm_vmid);
699	return (vm->vm_vmid);
700}
701
702uint32_t
703vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
704{
705	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
706		return (0);
707	dprintf("%s: vmid %u is vmm id %u", __func__,
708	    vmid, vm->vm_params.vmc_params.vcp_id);
709	return (vm->vm_params.vmc_params.vcp_id);
710}
711
712struct vmd_vm *
713vm_getbyname(const char *name)
714{
715	struct vmd_vm	*vm;
716
717	if (name == NULL)
718		return (NULL);
719	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
720		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
721			return (vm);
722	}
723
724	return (NULL);
725}
726
727struct vmd_vm *
728vm_getbypid(pid_t pid)
729{
730	struct vmd_vm	*vm;
731
732	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
733		if (vm->vm_pid == pid)
734			return (vm);
735	}
736
737	return (NULL);
738}
739
740void
741vm_stop(struct vmd_vm *vm, int keeptty)
742{
743	unsigned int	 i;
744
745	if (vm == NULL)
746		return;
747
748	vm->vm_running = 0;
749	vm->vm_shutdown = 0;
750
751	if (vm->vm_iev.ibuf.fd != -1) {
752		event_del(&vm->vm_iev.ev);
753		close(vm->vm_iev.ibuf.fd);
754	}
755	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
756		if (vm->vm_disks[i] != -1) {
757			close(vm->vm_disks[i]);
758			vm->vm_disks[i] = -1;
759		}
760	}
761	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
762		if (vm->vm_ifs[i].vif_fd != -1) {
763			close(vm->vm_ifs[i].vif_fd);
764			vm->vm_ifs[i].vif_fd = -1;
765		}
766		free(vm->vm_ifs[i].vif_name);
767		free(vm->vm_ifs[i].vif_switch);
768		free(vm->vm_ifs[i].vif_group);
769		vm->vm_ifs[i].vif_name = NULL;
770		vm->vm_ifs[i].vif_switch = NULL;
771		vm->vm_ifs[i].vif_group = NULL;
772	}
773	if (vm->vm_kernel != -1) {
774		close(vm->vm_kernel);
775		vm->vm_kernel = -1;
776	}
777	vm->vm_uid = 0;
778	if (!keeptty)
779		vm_closetty(vm);
780}
781
782void
783vm_remove(struct vmd_vm *vm)
784{
785	if (vm == NULL)
786		return;
787
788	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);
789	vm_stop(vm, 0);
790	free(vm);
791}
792
793int
794vm_register(struct privsep *ps, struct vmop_create_params *vmc,
795    struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
796{
797	struct vmd_vm		*vm = NULL;
798	struct vm_create_params	*vcp = &vmc->vmc_params;
799	static const uint8_t	 zero_mac[ETHER_ADDR_LEN];
800	uint32_t		 rng;
801	unsigned int		 i;
802	struct vmd_switch	*sw;
803
804	errno = 0;
805	*ret_vm = NULL;
806
807	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
808	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
809		if (vm_checkperm(vm, uid) != 0 || vmc->vmc_flags != 0) {
810			errno = EPERM;
811			goto fail;
812		}
813		*ret_vm = vm;
814		errno = EALREADY;
815		goto fail;
816	}
817
818	/*
819	 * non-root users can only start existing VMs
820	 * XXX there could be a mechanism to allow overriding some options
821	 */
822	if (vm_checkperm(NULL, uid) != 0) {
823		errno = EPERM;
824		goto fail;
825	}
826	if (vmc->vmc_flags == 0) {
827		errno = ENOENT;
828		goto fail;
829	}
830	if (vcp->vcp_ncpus == 0)
831		vcp->vcp_ncpus = 1;
832	if (vcp->vcp_memranges[0].vmr_size == 0)
833		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
834	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
835		log_warnx("invalid number of CPUs");
836		goto fail;
837	} else if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) {
838		log_warnx("invalid number of disks");
839		goto fail;
840	} else if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) {
841		log_warnx("invalid number of interfaces");
842		goto fail;
843	} else if (strlen(vcp->vcp_kernel) == 0 && vcp->vcp_ndisks == 0) {
844		log_warnx("no kernel or disk specified");
845		goto fail;
846	} else if (strlen(vcp->vcp_name) == 0) {
847		log_warnx("invalid VM name");
848		goto fail;
849	}
850
851	if ((vm = calloc(1, sizeof(*vm))) == NULL)
852		goto fail;
853
854	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
855	vmc = &vm->vm_params;
856	vcp = &vmc->vmc_params;
857	vm->vm_pid = -1;
858	vm->vm_tty = -1;
859
860	for (i = 0; i < vcp->vcp_ndisks; i++)
861		vm->vm_disks[i] = -1;
862	for (i = 0; i < vcp->vcp_nnics; i++) {
863		vm->vm_ifs[i].vif_fd = -1;
864
865		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
866			/* inherit per-interface flags from the switch */
867			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
868		}
869
870		/*
871		 * If the MAC address is zero, always randomize it in vmd(8)
872		 * because we cannot rely on the guest OS to do the right
873		 * thing like OpenBSD does.  Based on ether_fakeaddr()
874		 * from the kernel, incremented by one to differentiate
875		 * the source.
876		 */
877		if (memcmp(zero_mac, &vcp->vcp_macs[i], ETHER_ADDR_LEN) == 0) {
878			rng = arc4random();
879			vcp->vcp_macs[i][0] = 0xfe;
880			vcp->vcp_macs[i][1] = 0xe1;
881			vcp->vcp_macs[i][2] = 0xba + 1;
882			vcp->vcp_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
883			vcp->vcp_macs[i][4] = rng;
884			vcp->vcp_macs[i][5] = rng >> 8;
885		}
886	}
887	vm->vm_kernel = -1;
888	vm->vm_iev.ibuf.fd = -1;
889
890	if (++env->vmd_nvm == 0)
891		fatalx("too many vms");
892
893	/* Assign a new internal Id if not specified */
894	vm->vm_vmid = id == 0 ? env->vmd_nvm : id;
895
896	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);
897
898	*ret_vm = vm;
899	return (0);
900 fail:
901	if (errno == 0)
902		errno = EINVAL;
903	return (-1);
904}
905
906int
907vm_checkperm(struct vmd_vm *vm, uid_t uid)
908{
909	struct group	*gr;
910	struct passwd	*pw;
911	char		**grmem;
912
913	/* root has no restrictions */
914	if (uid == 0)
915		return (0);
916
917	if (vm == NULL)
918		return (-1);
919
920	/* check supplementary groups */
921	if (vm->vm_params.vmc_gid != -1 &&
922	    (pw = getpwuid(uid)) != NULL &&
923	    (gr = getgrgid(vm->vm_params.vmc_gid)) != NULL) {
924		for (grmem = gr->gr_mem; *grmem; grmem++)
925			if (strcmp(*grmem, pw->pw_name) == 0)
926				return (0);
927	}
928
929	/* check user */
930	if ((vm->vm_running && vm->vm_uid == uid) ||
931	    (!vm->vm_running && vm->vm_params.vmc_uid == uid))
932		return (0);
933
934	return (-1);
935}
936
937int
938vm_opentty(struct vmd_vm *vm)
939{
940	struct ptmget		 ptm;
941	struct stat		 st;
942	struct group		*gr;
943	uid_t			 uid;
944	gid_t			 gid;
945	mode_t			 mode;
946
947	/*
948	 * Open tty with pre-opened PTM fd
949	 */
950	if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1))
951		return (-1);
952
953	vm->vm_tty = ptm.cfd;
954	close(ptm.sfd);
955	if ((vm->vm_ttyname = strdup(ptm.sn)) == NULL)
956		goto fail;
957
958	uid = vm->vm_uid;
959	gid = vm->vm_params.vmc_gid;
960
961	if (vm->vm_params.vmc_gid != -1) {
962		mode = 0660;
963	} else if ((gr = getgrnam("tty")) != NULL) {
964		gid = gr->gr_gid;
965		mode = 0620;
966	} else {
967		mode = 0600;
968		gid = 0;
969	}
970
971	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
972	    __func__, vm->vm_params.vmc_params.vcp_name,
973	    vm->vm_ttyname, uid, gid, mode);
974
975	/*
976	 * Change ownership and mode of the tty as required.
977	 * Loosely based on the implementation of sshpty.c
978	 */
979	if (stat(vm->vm_ttyname, &st) == -1)
980		goto fail;
981
982	if (st.st_uid != uid || st.st_gid != gid) {
983		if (chown(vm->vm_ttyname, uid, gid) == -1) {
984			log_warn("chown %s %d %d failed, uid %d",
985			    vm->vm_ttyname, uid, gid, getuid());
986
987			/* Ignore failure on read-only filesystems */
988			if (!((errno == EROFS) &&
989			    (st.st_uid == uid || st.st_uid == 0)))
990				goto fail;
991		}
992	}
993
994	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
995		if (chmod(vm->vm_ttyname, mode) == -1) {
996			log_warn("chmod %s %o failed, uid %d",
997			    vm->vm_ttyname, mode, getuid());
998
999			/* Ignore failure on read-only filesystems */
1000			if (!((errno == EROFS) &&
1001			    (st.st_uid == uid || st.st_uid == 0)))
1002				goto fail;
1003		}
1004	}
1005
1006	return (0);
1007 fail:
1008	vm_closetty(vm);
1009	return (-1);
1010}
1011
1012void
1013vm_closetty(struct vmd_vm *vm)
1014{
1015	if (vm->vm_tty != -1) {
1016		/* Release and close the tty */
1017		if (fchown(vm->vm_tty, 0, 0) == -1)
1018			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1019		if (fchmod(vm->vm_tty, 0666) == -1)
1020			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1021		close(vm->vm_tty);
1022		vm->vm_tty = -1;
1023	}
1024	free(vm->vm_ttyname);
1025	vm->vm_ttyname = NULL;
1026}
1027
1028void
1029switch_remove(struct vmd_switch *vsw)
1030{
1031	struct vmd_if	*vif;
1032
1033	if (vsw == NULL)
1034		return;
1035
1036	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1037
1038	while ((vif = TAILQ_FIRST(&vsw->sw_ifs)) != NULL) {
1039		free(vif->vif_name);
1040		free(vif->vif_switch);
1041		TAILQ_REMOVE(&vsw->sw_ifs, vif, vif_entry);
1042		free(vif);
1043	}
1044
1045	free(vsw->sw_group);
1046	free(vsw->sw_name);
1047	free(vsw);
1048}
1049
1050struct vmd_switch *
1051switch_getbyname(const char *name)
1052{
1053	struct vmd_switch	*vsw;
1054
1055	if (name == NULL)
1056		return (NULL);
1057	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1058		if (strcmp(vsw->sw_name, name) == 0)
1059			return (vsw);
1060	}
1061
1062	return (NULL);
1063}
1064
/*
 * Return a newly allocated, NUL-terminated copy of the leading run of
 * printable characters (as classified by isprint(3)) in the buffer ptr,
 * looking at no more than len bytes.
 *
 * Returns NULL if ptr is NULL or if the allocation fails.  The caller owns
 * the result and releases it with free(3).
 */
char *
get_string(uint8_t *ptr, size_t len)
{
	size_t	 i;

	if (ptr == NULL)
		return (NULL);

	/* Stop at the first byte that is not printable, e.g. a NUL */
	for (i = 0; i < len; i++)
		if (!isprint(ptr[i]))
			break;

	/* strndup(3) takes a char pointer; the buffer holds raw bytes */
	return (strndup((const char *)ptr, i));
}
1076
/*
 * Convert an IPv4 prefix length into a netmask in network byte order.
 * A length of 0 gives 0; lengths above 32 are treated as 32.
 */
uint32_t
prefixlen2mask(uint8_t prefixlen)
{
	unsigned int	 hostbits;

	if (prefixlen == 0)
		return (0);

	/* Number of low bits to clear; 0 for a full /32 (or longer) */
	hostbits = (prefixlen > 32) ? 0 : 32 - prefixlen;

	return (htonl(0xffffffff << hostbits));
}
1088