vmd.c revision 1.156
1/*	$OpenBSD: vmd.c,v 1.156 2024/04/08 12:48:26 tobhe Exp $	*/
2
3/*
4 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/types.h>
20#include <sys/queue.h>
21#include <sys/wait.h>
22#include <sys/stat.h>
23#include <sys/sysctl.h>
24#include <sys/tty.h>
25#include <sys/ttycom.h>
26#include <sys/ioctl.h>
27
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31#include <termios.h>
32#include <errno.h>
33#include <event.h>
34#include <fcntl.h>
35#include <pwd.h>
36#include <signal.h>
37#include <syslog.h>
38#include <unistd.h>
39#include <util.h>
40#include <ctype.h>
41#include <grp.h>
42
43#include <machine/specialreg.h>
44#include <machine/vmmvar.h>
45
46#include "proc.h"
47#include "atomicio.h"
48#include "vmd.h"
49
50__dead void usage(void);
51
52int	 main(int, char **);
53int	 vmd_configure(void);
54void	 vmd_sighdlr(int sig, short event, void *arg);
55void	 vmd_shutdown(void);
56int	 vmd_control_run(void);
57int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
58int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
59int	 vmd_dispatch_agentx(int, struct privsep_proc *, struct imsg *);
60int	 vmd_dispatch_priv(int, struct privsep_proc *, struct imsg *);
61int	 vmd_check_vmh(struct vm_dump_header *);
62
63int	 vm_instance(struct privsep *, struct vmd_vm **,
64	    struct vmop_create_params *, uid_t);
65int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
66int	 vm_claimid(const char *, int, uint32_t *);
67void	 start_vm_batch(int, short, void*);
68
69static inline void vm_terminate(struct vmd_vm *, const char *);
70
71struct vmd	*env;
72
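/*
 * Process table consumed by proc_init() below: each entry pairs a process
 * title and id with its imsg dispatch callback and entry point; the vmm
 * and agentx entries additionally name a shutdown handler and a chroot
 * path.
 */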
73static struct privsep_proc procs[] = {
74	/* Keep "priv" on top as procs[0] */
75	{ "priv",	PROC_PRIV,	vmd_dispatch_priv, priv },
76	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
77	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm,
78	  vmm_shutdown, "/" },
79	{ "agentx", 	PROC_AGENTX,	vmd_dispatch_agentx, vm_agentx,
80	  vm_agentx_shutdown, "/" }
81};
82
83enum privsep_procid privsep_process;
84
85struct event staggered_start_timer;
86
87/* For the privileged process */
88static struct privsep_proc *proc_priv = &procs[0];
89static struct passwd proc_privpw;
90static const uint8_t zero_mac[ETHER_ADDR_LEN];
91
92const char		 default_conffile[] = VMD_CONF;
93const char		*conffile = default_conffile;
94
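/*
 * vmd_dispatch_control
 *
 * imsg dispatch handler for requests arriving from the control process,
 * i.e. vmctl(8) commands.  Requests are validated here in the parent and,
 * where needed, relayed to the vmm process; a response is sent back to
 * the control client for the commands that expect one.
 */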
95int
96vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
97{
98	struct privsep			*ps = p->p_ps;
99	int				 res = 0, ret = 0, cmd = 0, verbose;
100	int				 ifd;
101	unsigned int			 v = 0, flags;
102	struct vmop_create_params	 vmc;
103	struct vmop_id			 vid;
104	struct vmop_result		 vmr;
105	struct vm_dump_header		 vmh;
106	struct vmd_vm			*vm = NULL;
107	char				*str = NULL;
108	uint32_t			 id = 0;
109	struct control_sock		*rcs;
110
111	switch (imsg->hdr.type) {
112	case IMSG_VMDOP_START_VM_REQUEST:
113		IMSG_SIZE_CHECK(imsg, &vmc);
114		memcpy(&vmc, imsg->data, sizeof(vmc));
115		vmc.vmc_kernel = imsg_get_fd(imsg);
116
117		/* Try registering our VM in our list of known VMs. */
118		if (vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid)) {
119			res = errno;
120
121			/* Did we have a failure during lookup of a parent? */
122			if (vm == NULL) {
123				cmd = IMSG_VMDOP_START_VM_RESPONSE;
124				break;
125			}
126
127			/* Does the VM already exist? */
128			if (res == EALREADY) {
129				/* Is it already running? */
130				if (vm->vm_state & VM_STATE_RUNNING) {
131					cmd = IMSG_VMDOP_START_VM_RESPONSE;
132					break;
133				}
134
135				/* If not running, are our flags ok? */
136				if (vmc.vmc_flags &&
137				    vmc.vmc_flags != VMOP_CREATE_KERNEL) {
138					cmd = IMSG_VMDOP_START_VM_RESPONSE;
139					break;
140				}
141			}
142			res = 0;
143		}
144
145		/* Try to start the launch of the VM. */
146		res = config_setvm(ps, vm, imsg->hdr.peerid,
147		    vm->vm_params.vmc_owner.uid);
148		if (res)
149			cmd = IMSG_VMDOP_START_VM_RESPONSE;
150		break;
151	case IMSG_VMDOP_WAIT_VM_REQUEST:
152	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
153		IMSG_SIZE_CHECK(imsg, &vid);
154		memcpy(&vid, imsg->data, sizeof(vid));
155		flags = vid.vid_flags;
156		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
157
158		if ((id = vid.vid_id) == 0) {
159			/* Lookup vm (id) by name */
160			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
161				res = ENOENT;
162				break;
163			}
164			id = vm->vm_vmid;
165		} else if ((vm = vm_getbyvmid(id)) == NULL) {
166			res = ENOENT;
167			break;
168		}
169
170		/* Validate current state of vm */
171		if ((vm->vm_state & VM_STATE_SHUTDOWN) &&
172		    (flags & VMOP_FORCE) == 0) {
173			res = EALREADY;
174			break;
175		} else if (!(vm->vm_state & VM_STATE_RUNNING)) {
176			res = EINVAL;
177			break;
178		} else if (vm_checkperm(vm, &vm->vm_params.vmc_owner, vid.vid_uid)) {
179			res = EPERM;
180			break;
181		}
182
183		/* Only relay TERMINATION requests, not WAIT requests */
184		if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_REQUEST) {
185			memset(&vid, 0, sizeof(vid));
186			vid.vid_id = id;
187			vid.vid_flags = flags;
188
189			if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
190				imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
191				return (-1);
192		}
193		break;
194	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
195		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
196		break;
197	case IMSG_VMDOP_LOAD:
198		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
199		str = get_string((uint8_t *)imsg->data,
200		    IMSG_DATA_SIZE(imsg));	/* FALLTHROUGH */
201	case IMSG_VMDOP_RELOAD:
202		if (vmd_reload(0, str) == -1)
203			cmd = IMSG_CTL_FAIL;
204		else
205			cmd = IMSG_CTL_OK;
206		free(str);
207		break;
208	case IMSG_CTL_RESET:
209		IMSG_SIZE_CHECK(imsg, &v);
210		memcpy(&v, imsg->data, sizeof(v));
211		if (vmd_reload(v, NULL) == -1)
212			cmd = IMSG_CTL_FAIL;
213		else
214			cmd = IMSG_CTL_OK;
215		break;
216	case IMSG_CTL_VERBOSE:
217		IMSG_SIZE_CHECK(imsg, &verbose);
218		memcpy(&verbose, imsg->data, sizeof(verbose));
219		log_setverbose(verbose);
220
221		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
222		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
223		cmd = IMSG_CTL_OK;
224		break;
225	case IMSG_VMDOP_PAUSE_VM:
226	case IMSG_VMDOP_UNPAUSE_VM:
227		IMSG_SIZE_CHECK(imsg, &vid);
228		memcpy(&vid, imsg->data, sizeof(vid));
229		if (vid.vid_id == 0) {
230			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
231				res = ENOENT;
232				cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
233				    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
234				    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
235				break;
236			} else {
237				vid.vid_id = vm->vm_vmid;
238			}
239		} else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
240			res = ENOENT;
241			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
242			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
243			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
244			break;
245		}
246		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
247		    vid.vid_uid) != 0) {
248			res = EPERM;
249			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
250			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
251			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
252			break;
253		}
254		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
255		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
256		break;
257	case IMSG_VMDOP_SEND_VM_REQUEST:
258		IMSG_SIZE_CHECK(imsg, &vid);
259		memcpy(&vid, imsg->data, sizeof(vid));
260		id = vid.vid_id;
261		ifd = imsg_get_fd(imsg);
262		if (vid.vid_id == 0) {
263			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
264				res = ENOENT;
265				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
266				close(ifd);
267				break;
268			} else {
269				vid.vid_id = vm->vm_vmid;
270			}
271		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
272			res = ENOENT;
273			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
274			close(ifd);
275			break;
276		}
277		vmr.vmr_id = vid.vid_id;
278		log_debug("%s: sending fd to vmm", __func__);
279		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
280		    imsg->hdr.peerid, ifd, &vid, sizeof(vid));
281		break;
282	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
283		IMSG_SIZE_CHECK(imsg, &vid);
284		memcpy(&vid, imsg->data, sizeof(vid));
285		ifd = imsg_get_fd(imsg);
286		if (ifd == -1) {
287			log_warnx("%s: invalid fd", __func__);
288			return (-1);
289		}
290		if (atomicio(read, ifd, &vmh, sizeof(vmh)) != sizeof(vmh)) {
291			log_warnx("%s: error reading vmh from received vm",
292			    __func__);
293			res = EIO;
294			close(ifd);
295			cmd = IMSG_VMDOP_START_VM_RESPONSE;
296			break;
297		}
298
299		if (vmd_check_vmh(&vmh)) {
300			res = ENOENT;
301			close(ifd);
302			cmd = IMSG_VMDOP_START_VM_RESPONSE;
303			break;
304		}
305		if (atomicio(read, ifd, &vmc, sizeof(vmc)) != sizeof(vmc)) {
306			log_warnx("%s: error reading vmc from received vm",
307			    __func__);
308			res = EIO;
309			close(ifd);
310			cmd = IMSG_VMDOP_START_VM_RESPONSE;
311			break;
312		}
313		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
314		    sizeof(vmc.vmc_params.vcp_name));
315		vmc.vmc_params.vcp_id = 0;
316
317		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
318		if (ret != 0) {
319			res = errno;
320			cmd = IMSG_VMDOP_START_VM_RESPONSE;
321			close(ifd);
322		} else {
323			vm->vm_state |= VM_STATE_RECEIVED;
324			config_setvm(ps, vm, imsg->hdr.peerid,
325			    vmc.vmc_owner.uid);
326			log_debug("%s: sending fd to vmm", __func__);
327			proc_compose_imsg(ps, PROC_VMM, -1,
328			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, ifd,
329			    NULL, 0);
330		}
331		break;
332	case IMSG_VMDOP_DONE:
333		control_reset(&ps->ps_csock);
334		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
335			control_reset(rcs);
336		cmd = 0;
337		break;
338	default:
339		return (-1);
340	}
341
342	switch (cmd) {
343	case 0:
344		break;
345	case IMSG_VMDOP_START_VM_RESPONSE:
346	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
347		memset(&vmr, 0, sizeof(vmr));
348		vmr.vmr_result = res;
349		vmr.vmr_id = id;
350		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
351		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
352			return (-1);
353		break;
354	default:
355		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
356		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
357			return (-1);
358		break;
359	}
360
361	return (0);
362}
363
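/*
 * vmd_dispatch_vmm
 *
 * imsg dispatch handler for responses and events coming back from the
 * vmm process: updates the parent's view of the VM state and relays
 * results to waiting control clients.
 */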
364int
365vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
366{
367	struct vmop_result	 vmr;
368	struct privsep		*ps = p->p_ps;
369	int			 res = 0;
370	struct vmd_vm		*vm;
371	struct vm_create_params	*vcp;
372	struct vmop_info_result	 vir;
373
374	switch (imsg->hdr.type) {
375	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
376		IMSG_SIZE_CHECK(imsg, &vmr);
377		memcpy(&vmr, imsg->data, sizeof(vmr));
378		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
379			break;
380		proc_compose_imsg(ps, PROC_CONTROL, -1,
381		    imsg->hdr.type, imsg->hdr.peerid, -1,
382		    &vmr, sizeof(vmr));
383		log_info("%s: paused vm %d successfully",
384		    vm->vm_params.vmc_params.vcp_name,
385		    vm->vm_vmid);
386		vm->vm_state |= VM_STATE_PAUSED;
387		break;
388	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
389		IMSG_SIZE_CHECK(imsg, &vmr);
390		memcpy(&vmr, imsg->data, sizeof(vmr));
391		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
392			break;
393		proc_compose_imsg(ps, PROC_CONTROL, -1,
394		    imsg->hdr.type, imsg->hdr.peerid, -1,
395		    &vmr, sizeof(vmr));
396		log_info("%s: unpaused vm %d successfully.",
397		    vm->vm_params.vmc_params.vcp_name,
398		    vm->vm_vmid);
399		vm->vm_state &= ~VM_STATE_PAUSED;
400		break;
401	case IMSG_VMDOP_START_VM_RESPONSE:
402		IMSG_SIZE_CHECK(imsg, &vmr);
403		memcpy(&vmr, imsg->data, sizeof(vmr));
404		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
405			break;
406		vm->vm_pid = vmr.vmr_pid;
407		vcp = &vm->vm_params.vmc_params;
408		vcp->vcp_id = vmr.vmr_id;
409
410		/*
411		 * If the peerid is not -1, forward the response back to
412		 * the control socket.  If it is -1, the request originated
413		 * from the parent, not the control socket.
414		 */
415		if (vm->vm_peerid != (uint32_t)-1) {
416			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
417			    sizeof(vmr.vmr_ttyname));
418			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
419			    imsg->hdr.type, vm->vm_peerid, -1,
420			    &vmr, sizeof(vmr)) == -1) {
421				errno = vmr.vmr_result;
422				log_warn("%s: failed to forward vm result",
423				    vcp->vcp_name);
424				vm_terminate(vm, __func__);
425				return (-1);
426			}
427		}
428
429		if (vmr.vmr_result) {
430			log_warnx("%s: failed to start vm", vcp->vcp_name);
431			vm_terminate(vm, __func__);
432			errno = vmr.vmr_result;
433			break;
434		}
435
436		/* Now configure all the interfaces */
437		if (vm_priv_ifconfig(ps, vm) == -1) {
438			log_warn("%s: failed to configure vm", vcp->vcp_name);
439			vm_terminate(vm, __func__);
440			break;
441		}
442
443		log_info("started %s (vm %d) successfully, tty %s",
444		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
445		break;
446	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
447		IMSG_SIZE_CHECK(imsg, &vmr);
448		memcpy(&vmr, imsg->data, sizeof(vmr));
449
450		if (vmr.vmr_result) {
451			DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
452			    __func__, vmr.vmr_id);
453			proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
454		} else {
455			if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
456				break;
457			/* Mark VM as shutting down */
458			vm->vm_state |= VM_STATE_SHUTDOWN;
459		}
460		break;
461	case IMSG_VMDOP_SEND_VM_RESPONSE:
462		IMSG_SIZE_CHECK(imsg, &vmr);
463		memcpy(&vmr, imsg->data, sizeof(vmr));
464		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
465			break;
466		if (!vmr.vmr_result) {
467			log_info("%s: sent vm %d successfully.",
468			    vm->vm_params.vmc_params.vcp_name,
469			    vm->vm_vmid);
470			vm_terminate(vm, __func__);
471		}
472
473		/* Send a response if a control client is waiting for it */
474		if (imsg->hdr.peerid != (uint32_t)-1) {
475			/* the error is meaningless for deferred responses */
476			vmr.vmr_result = 0;
477
478			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
479			    IMSG_VMDOP_SEND_VM_RESPONSE,
480			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
481				return (-1);
482		}
483		break;
484	case IMSG_VMDOP_TERMINATE_VM_EVENT:
485		IMSG_SIZE_CHECK(imsg, &vmr);
486		memcpy(&vmr, imsg->data, sizeof(vmr));
487		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
488		    __func__, vmr.vmr_id, vmr.vmr_result);
489		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
490			log_debug("%s: vm %d is no longer available",
491			    __func__, vmr.vmr_id);
492			break;
493		}
494		if (vmr.vmr_result != EAGAIN ||
495		    vm->vm_params.vmc_bootdevice) {
496			vm_terminate(vm, __func__);
497		} else {
498			/* Stop VM instance but keep the tty open */
499			vm_stop(vm, 1, __func__);
500			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
501		}
502
503		/* The error is meaningless for deferred responses */
504		vmr.vmr_result = 0;
505
506		if (proc_compose_imsg(ps, PROC_CONTROL, -1,
507			IMSG_VMDOP_TERMINATE_VM_EVENT,
508			imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
509			return (-1);
510		break;
511	case IMSG_VMDOP_GET_INFO_VM_DATA:
512		IMSG_SIZE_CHECK(imsg, &vir);
513		memcpy(&vir, imsg->data, sizeof(vir));
514		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
515			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
516			if (vm->vm_ttyname[0] != '\0')
517				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
518				    sizeof(vir.vir_ttyname));
519			log_debug("%s: running vm: %d, vm_state: 0x%x",
520			    __func__, vm->vm_vmid, vm->vm_state);
521			vir.vir_state = vm->vm_state;
522			/* get the user id who started the vm */
523			vir.vir_uid = vm->vm_uid;
524			vir.vir_gid = vm->vm_params.vmc_owner.gid;
525		}
526		if (proc_compose_imsg(ps,
527		    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
528		    PROC_AGENTX : PROC_CONTROL, -1, imsg->hdr.type,
529		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
530			if (vm)
531				vm_terminate(vm, __func__);
532			return (-1);
533		}
534		break;
535	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
536		/*
537		 * PROC_VMM has responded with the *running* VMs, now we
538		 * append the others. These use the special value 0 for their
539		 * kernel id to indicate that they are not running.
540		 */
541		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
542			if (!(vm->vm_state & VM_STATE_RUNNING)) {
543				memset(&vir, 0, sizeof(vir));
544				vir.vir_info.vir_id = vm->vm_vmid;
545				strlcpy(vir.vir_info.vir_name,
546				    vm->vm_params.vmc_params.vcp_name,
547				    VMM_MAX_NAME_LEN);
548				vir.vir_info.vir_memory_size =
549				    vm->vm_params.vmc_params.
550				    vcp_memranges[0].vmr_size;
551				vir.vir_info.vir_ncpus =
552				    vm->vm_params.vmc_params.vcp_ncpus;
553				/* get the configured user id for this vm */
554				vir.vir_uid = vm->vm_params.vmc_owner.uid;
555				vir.vir_gid = vm->vm_params.vmc_owner.gid;
556				log_debug("%s: vm: %d, vm_state: 0x%x",
557				    __func__, vm->vm_vmid, vm->vm_state);
558				vir.vir_state = vm->vm_state;
559				if (proc_compose_imsg(ps,
560				    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
561				    PROC_AGENTX : PROC_CONTROL, -1,
562				    IMSG_VMDOP_GET_INFO_VM_DATA,
563				    imsg->hdr.peerid, -1, &vir,
564				    sizeof(vir)) == -1) {
565					log_debug("%s: GET_INFO_VM_END failed",
566					    __func__);
567					vm_terminate(vm, __func__);
568					return (-1);
569				}
570			}
571		}
572		IMSG_SIZE_CHECK(imsg, &res);
573		proc_forward_imsg(ps, imsg,
574		    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
575		    PROC_AGENTX : PROC_CONTROL, -1);
576		break;
577	default:
578		return (-1);
579	}
580
581	return (0);
582}
583
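/*
 * vmd_dispatch_agentx
 *
 * imsg dispatch handler for the agentx process; only VM information
 * requests are accepted and forwarded to the vmm process.
 */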
584int
585vmd_dispatch_agentx(int fd, struct privsep_proc *p, struct imsg *imsg)
586{
587	struct privsep			*ps = p->p_ps;
588
589	switch (imsg->hdr.type) {
590	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
591		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
592		return (0);
593	default:
594		break;
595	}
596	return (-1);
597}
598
599int
600vmd_dispatch_priv(int fd, struct privsep_proc *p, struct imsg *imsg)
601{
602	struct vmop_addr_result	 var;
603
604	switch (imsg->hdr.type) {
605	case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
606		IMSG_SIZE_CHECK(imsg, &var);
607		memcpy(&var, imsg->data, sizeof(var));
608		proc_forward_imsg(p->p_ps, imsg, PROC_VMM, -1);
609		break;
610	default:
611		return (-1);
612	}
613
614	return (0);
615}
616
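/*
 * vmd_check_vmh
 *
 * Checks if a received vm dump is compatible with this host by comparing
 * the dump header's signature, version and saved cpuid leaves against
 * the local CPU.
 *
 * Return values:
 *   0: the dump can be received on this host
 *  -1: the dump is incompatible
 */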
617int
618vmd_check_vmh(struct vm_dump_header *vmh)
619{
620	int i;
621	unsigned int code, leaf;
622	unsigned int a, b, c, d;
623
624	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) {
625		log_warnx("%s: incompatible dump signature", __func__);
626		return (-1);
627	}
628
629	if (vmh->vmh_version != VM_DUMP_VERSION) {
630		log_warnx("%s: incompatible dump version", __func__);
631		return (-1);
632	}
633
634	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
635		code = vmh->vmh_cpuids[i].code;
636		leaf = vmh->vmh_cpuids[i].leaf;
637		if (leaf != 0x00) {
638			log_debug("%s: invalid leaf 0x%x for code 0x%x",
639			    __func__, leaf, code);
640			return (-1);
641		}
642
643		switch (code) {
644		case 0x00:
645			CPUID_LEAF(code, leaf, a, b, c, d);
646			if (vmh->vmh_cpuids[i].a > a) {
647				log_debug("%s: incompatible cpuid level",
648				    __func__);
649				return (-1);
650			}
651			if (!(vmh->vmh_cpuids[i].b == b &&
652			    vmh->vmh_cpuids[i].c == c &&
653			    vmh->vmh_cpuids[i].d == d)) {
654				log_debug("%s: incompatible cpu brand",
655				    __func__);
656				return (-1);
657			}
658			break;
659
660		case 0x01:
661			CPUID_LEAF(code, leaf, a, b, c, d);
662			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
663			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
664				log_debug("%s: incompatible cpu features "
665				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
666				    code, leaf);
667				return (-1);
668			}
669			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
670			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
671				log_debug("%s: incompatible cpu features "
672				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
673				    code, leaf);
674				return (-1);
675			}
676			break;
677
678		case 0x07:
679			CPUID_LEAF(code, leaf, a, b, c, d);
680			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
681			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
682				log_debug("%s: incompatible cpu features "
683				    "code: 0x%x leaf: 0x%x  reg: b", __func__,
684				    code, leaf);
685				return (-1);
686			}
687			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
688			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
689				log_debug("%s: incompatible cpu features "
690				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
691				    code, leaf);
692				return (-1);
693			}
694			break;
695
696		case 0x0d:
697			CPUID_LEAF(code, leaf, a, b, c, d);
698			if (vmh->vmh_cpuids[i].b > b) {
699				log_debug("%s: incompatible cpu: insufficient "
700				    "max save area for enabled XCR0 features",
701				    __func__);
702				return (-1);
703			}
704			if (vmh->vmh_cpuids[i].c > c) {
705				log_debug("%s: incompatible cpu: insufficient "
706				    "max save area for supported XCR0 features",
707				    __func__);
708				return (-1);
709			}
710			break;
711
712		case 0x80000001:
713			CPUID_LEAF(code, leaf, a, b, c, d);
714			if ((vmh->vmh_cpuids[i].a & a) !=
715			    vmh->vmh_cpuids[i].a) {
716				log_debug("%s: incompatible cpu features "
717				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
718				    code, leaf);
719				return (-1);
720			}
721			if ((vmh->vmh_cpuids[i].c & c) !=
722			    vmh->vmh_cpuids[i].c) {
723				log_debug("%s: incompatible cpu features "
724				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
725				    code, leaf);
726				return (-1);
727			}
728			if ((vmh->vmh_cpuids[i].d & d) !=
729			    vmh->vmh_cpuids[i].d) {
730				log_debug("%s: incompatible cpu features "
731				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
732				    code, leaf);
733				return (-1);
734			}
735			break;
736
737		default:
738			log_debug("%s: unknown code 0x%x", __func__, code);
739			return (-1);
740		}
741	}
742
743	return (0);
744}
745
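/*
 * vmd_sighdlr
 *
 * Signal handler registered with libevent; it runs from the event loop
 * rather than in signal context, and only the parent process acts on
 * signals.
 */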
746void
747vmd_sighdlr(int sig, short event, void *arg)
748{
749	if (privsep_process != PROC_PARENT)
750		return;
751	log_debug("%s: handling signal", __func__);
752
753	switch (sig) {
754	case SIGHUP:
755		log_info("%s: reload requested with SIGHUP", __func__);
756
757		/*
758		 * This is safe because libevent uses async signal handlers
759		 * that run in the event loop and not in signal context.
760		 */
761		(void)vmd_reload(0, NULL);
762		break;
763	case SIGPIPE:
764		log_info("%s: ignoring SIGPIPE", __func__);
765		break;
766	case SIGUSR1:
767		log_info("%s: ignoring SIGUSR1", __func__);
768		break;
769	case SIGTERM:
770	case SIGINT:
771		vmd_shutdown();
772		break;
773	default:
774		fatalx("unexpected signal");
775	}
776}
777
778__dead void
779usage(void)
780{
781	extern char *__progname;
782	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
783	    __progname);
784	exit(1);
785}
786
787int
788main(int argc, char **argv)
789{
790	struct privsep		*ps;
791	int			 ch;
792	enum privsep_procid	 proc_id = PROC_PARENT;
793	int			 proc_instance = 0, vm_launch = 0;
794	int			 vmm_fd = -1, vm_fd = -1;
795	const char		*errp, *title = NULL;
796	int			 argc0 = argc;
797	char			 dev_type = '\0';
798
799	log_init(0, LOG_DAEMON);
800
801	if ((env = calloc(1, sizeof(*env))) == NULL)
802		fatal("calloc: env");
803	env->vmd_fd = -1;
804	env->vmd_fd6 = -1;
805
806	while ((ch = getopt(argc, argv, "D:P:I:V:X:df:i:nt:vp:")) != -1) {
807		switch (ch) {
808		case 'D':
809			if (cmdline_symset(optarg) < 0)
810				log_warnx("could not parse macro definition %s",
811				    optarg);
812			break;
813		case 'd':
814			env->vmd_debug = 2;
815			break;
816		case 'f':
817			conffile = optarg;
818			break;
819		case 'v':
820			env->vmd_verbose++;
821			break;
822		/* vmd fork/exec */
823		case 'n':
824			env->vmd_noaction = 1;
825			break;
826		case 'P':
827			title = optarg;
828			proc_id = proc_getid(procs, nitems(procs), title);
829			if (proc_id == PROC_MAX)
830				fatalx("invalid process name");
831			break;
832		case 'I':
833			proc_instance = strtonum(optarg, 0,
834			    PROC_MAX_INSTANCES, &errp);
835			if (errp)
836				fatalx("invalid process instance");
837			break;
838		/* child vm and device fork/exec */
839		case 'p':
840			title = optarg;
841			break;
842		case 'V':
843			vm_launch = VMD_LAUNCH_VM;
844			vm_fd = strtonum(optarg, 0, 128, &errp);
845			if (errp)
846				fatalx("invalid vm fd");
847			break;
848		case 'X':
849			vm_launch = VMD_LAUNCH_DEV;
850			vm_fd = strtonum(optarg, 0, 128, &errp);
851			if (errp)
852				fatalx("invalid device fd");
853			break;
854		case 't':
855			dev_type = *optarg;
856			switch (dev_type) {
857			case VMD_DEVTYPE_NET:
858			case VMD_DEVTYPE_DISK:
859				break;
860			default: fatalx("invalid device type");
861			}
862			break;
863		case 'i':
864			vmm_fd = strtonum(optarg, 0, 128, &errp);
865			if (errp)
866				fatalx("invalid vmm fd");
867			break;
868		default:
869			usage();
870		}
871	}
872
873	argc -= optind;
874	if (argc > 0)
875		usage();
876
877	if (env->vmd_noaction && !env->vmd_debug)
878		env->vmd_debug = 1;
879
880	log_init(env->vmd_debug, LOG_DAEMON);
881	log_setverbose(env->vmd_verbose);
882
883	/* Re-exec from the vmm child process requires an absolute path. */
884	if (proc_id == PROC_PARENT && *argv[0] != '/' && !env->vmd_noaction)
885		fatalx("re-exec requires execution with an absolute path");
886	env->argv0 = argv[0];
887
888	/* check for root privileges */
889	if (env->vmd_noaction == 0 && !vm_launch) {
890		if (geteuid())
891			fatalx("need root privileges");
892	}
893
894	ps = &env->vmd_ps;
895	ps->ps_env = env;
896
897	if (config_init(env) == -1)
898		fatal("failed to initialize configuration");
899
900	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
901		fatal("unknown user %s", VMD_USER);
902
903	/* First proc runs as root without pledge but in default chroot */
904	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
905	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */
906
907	/*
908	 * If we're launching a new vm or its device, we short out here.
909	 */
910	if (vm_launch == VMD_LAUNCH_VM) {
911		vm_main(vm_fd, vmm_fd);
912		/* NOTREACHED */
913	} else if (vm_launch == VMD_LAUNCH_DEV) {
914		if (dev_type == VMD_DEVTYPE_NET) {
915			log_procinit("vm/%s/vionet", title);
916			vionet_main(vm_fd, vmm_fd);
917			/* NOTREACHED */
918		} else if (dev_type == VMD_DEVTYPE_DISK) {
919			log_procinit("vm/%s/vioblk", title);
920			vioblk_main(vm_fd, vmm_fd);
921			/* NOTREACHED */
922		}
923		fatalx("unsupported device type '%c'", dev_type);
924	}
925
926	/* Open /dev/vmm early. */
927	if (env->vmd_noaction == 0 && proc_id == PROC_PARENT) {
928		env->vmd_fd = open(VMM_NODE, O_RDWR | O_CLOEXEC);
929		if (env->vmd_fd == -1)
930			fatal("%s", VMM_NODE);
931	}
932
933	/* Configure the control socket */
934	ps->ps_csock.cs_name = SOCKET_NAME;
935	TAILQ_INIT(&ps->ps_rcsocks);
936
937	/* Configuration will be parsed after forking the children */
938	env->vmd_conffile = conffile;
939
940	if (env->vmd_noaction)
941		ps->ps_noaction = 1;
942	ps->ps_instance = proc_instance;
943	if (title != NULL)
944		ps->ps_title[proc_id] = title;
945
946	/* only the parent returns */
947	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
948	    proc_id);
949
950	if (ps->ps_noaction == 0)
951		log_info("startup");
952
953	event_init();
954
955	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
956	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
957	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
958	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
959	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);
960
961	signal_add(&ps->ps_evsigint, NULL);
962	signal_add(&ps->ps_evsigterm, NULL);
963	signal_add(&ps->ps_evsighup, NULL);
964	signal_add(&ps->ps_evsigpipe, NULL);
965	signal_add(&ps->ps_evsigusr1, NULL);
966
967	if (!env->vmd_noaction)
968		proc_connect(ps);
969
970	if (vmd_configure() == -1)
971		fatalx("configuration failed");
972
973	event_dispatch();
974
975	log_debug("exiting");
976
977	return (0);
978}
979
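/*
 * start_vm_batch
 *
 * Timer callback for the staggered start of VMs: starts up to
 * env->vmd_cfg.parallelism waiting VMs and, if more remain, re-arms the
 * timer with the configured delay.
 */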
980void
981start_vm_batch(int fd, short type, void *args)
982{
983	int		i = 0;
984	struct vmd_vm	*vm;
985
986	log_debug("%s: starting batch of %d vms", __func__,
987	    env->vmd_cfg.parallelism);
988	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
989		if (!(vm->vm_state & VM_STATE_WAITING)) {
990			log_debug("%s: not starting vm %s (disabled)",
991			    __func__,
992			    vm->vm_params.vmc_params.vcp_name);
993			continue;
994		}
995		i++;
996		if (i > env->vmd_cfg.parallelism) {
997			evtimer_add(&staggered_start_timer,
998			    &env->vmd_cfg.delay);
999			break;
1000		}
1001		vm->vm_state &= ~VM_STATE_WAITING;
1002		config_setvm(&env->vmd_ps, vm, -1, vm->vm_params.vmc_owner.uid);
1003	}
1004	log_debug("%s: done starting vms", __func__);
1005}
1006
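/*
 * vmd_configure
 *
 * Post-fork setup of the parent process: restricts itself with pledge(2),
 * parses the configuration, passes the /dev/vmm fd to the vmm process,
 * brings up the configured switches and kicks off the first batch of VMs.
 */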
1007int
1008vmd_configure(void)
1009{
1010	int			ncpus;
1011	struct vmd_switch	*vsw;
1012	int ncpu_mib[] = {CTL_HW, HW_NCPUONLINE};
1013	size_t ncpus_sz = sizeof(ncpus);
1014
1015	/*
1016	 * pledge in the parent process:
1017	 * stdio - for malloc and basic I/O including events.
1018	 * rpath - for reload to open and read the configuration files.
1019	 * wpath - for opening disk images and tap devices.
1020	 * tty - for openpty and TIOCUCNTL.
1021	 * proc - run kill to terminate its children safely.
1022	 * sendfd - for disks, interfaces and other fds.
1023	 * recvfd - for send and receive.
1024	 * getpw - lookup user or group id by name.
1025	 * chown, fattr - change tty ownership
1026	 * flock - locking disk files
1027	 */
1028	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
1029	    " chown fattr flock", NULL) == -1)
1030		fatal("pledge");
1031
1032	if ((env->vmd_ptmfd = getptmfd()) == -1)
1033		fatal("getptmfd %s", PATH_PTMDEV);
1034
1035	if (parse_config(env->vmd_conffile) == -1) {
1036		proc_kill(&env->vmd_ps);
1037		exit(1);
1038	}
1039
1040	if (env->vmd_noaction) {
1041		fprintf(stderr, "configuration OK\n");
1042		proc_kill(&env->vmd_ps);
1043		exit(0);
1044	}
1045
1046	/* Send VMM device fd to vmm proc. */
1047	proc_compose_imsg(&env->vmd_ps, PROC_VMM, -1,
1048	    IMSG_VMDOP_RECEIVE_VMM_FD, -1, env->vmd_fd, NULL, 0);
1049
1050	/* Send shared global configuration to all children */
1051	if (config_setconfig(env) == -1)
1052		return (-1);
1053
1054	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1055		if (vsw->sw_running)
1056			continue;
1057		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
1058			log_warn("%s: failed to create switch %s",
1059			    __func__, vsw->sw_name);
1060			switch_remove(vsw);
1061			return (-1);
1062		}
1063	}
1064
1065	if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) {
1066		env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY;
1067		if (sysctl(ncpu_mib, nitems(ncpu_mib), &ncpus, &ncpus_sz, NULL, 0) == -1)
1068			ncpus = 1;
1069		env->vmd_cfg.parallelism = ncpus;
1070		log_debug("%s: setting staggered start configuration to "
1071		    "parallelism: %d and delay: %lld",
1072		    __func__, ncpus, (long long) env->vmd_cfg.delay.tv_sec);
1073	}
1074
1075	log_debug("%s: starting vms in staggered fashion", __func__);
1076	evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
1077	/* start first batch */
1078	start_vm_batch(0, 0, NULL);
1079
1080	return (0);
1081}
1082
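/*
 * vmd_reload
 *
 * Handles SIGHUP and the vmctl load/reload commands: either purges the
 * configuration (non-zero reset) or parses the given config file; see
 * the comment below for the load/reload distinction.
 */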
1083int
1084vmd_reload(unsigned int reset, const char *filename)
1085{
1086	struct vmd_vm		*vm, *next_vm;
1087	struct vmd_switch	*vsw;
1088	int			 reload = 0;
1089
1090	/* Switch back to the default config file */
1091	if (filename == NULL || *filename == '\0') {
1092		filename = env->vmd_conffile;
1093		reload = 1;
1094	}
1095
1096	log_debug("%s: level %d config file %s", __func__, reset, filename);
1097
1098	if (reset) {
1099		/* Purge the configuration */
1100		config_purge(env, reset);
1101		config_setreset(env, reset);
1102	} else {
1103		/*
1104		 * Load or reload the configuration.
1105		 *
1106		 * Reloading removes all non-running VMs before processing the
1107		 * config file, whereas loading only adds to the existing list
1108		 * of VMs.
1109		 */
1110
1111		if (reload) {
1112			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
1113			    next_vm) {
1114				if (!(vm->vm_state & VM_STATE_RUNNING)) {
1115					DPRINTF("%s: calling vm_remove",
1116					    __func__);
1117					vm_remove(vm, __func__);
1118				}
1119			}
1120		}
1121
1122		if (parse_config(filename) == -1) {
1123			log_debug("%s: failed to load config file %s",
1124			    __func__, filename);
1125			return (-1);
1126		}
1127
1128		if (reload) {
1129			/* Update shared global configuration in all children */
1130			if (config_setconfig(env) == -1)
1131				return (-1);
1132		}
1133
1134		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1135			if (vsw->sw_running)
1136				continue;
1137			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
1138				log_warn("%s: failed to create switch %s",
1139				    __func__, vsw->sw_name);
1140				switch_remove(vsw);
1141				return (-1);
1142			}
1143		}
1144
1145		log_debug("%s: starting vms in staggered fashion", __func__);
1146		evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
1147		/* start first batch */
1148		start_vm_batch(0, 0, NULL);
1149
1150	}
1151
1152	return (0);
1153}
1154
1155void
1156vmd_shutdown(void)
1157{
1158	struct vmd_vm *vm, *vm_next;
1159
1160	log_debug("%s: performing shutdown", __func__);
1161
1162	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
1163		vm_remove(vm, __func__);
1164	}
1165
1166	proc_kill(&env->vmd_ps);
1167	free(env);
1168
1169	log_warnx("terminating");
1170	exit(0);
1171}
1172
1173struct vmd_vm *
1174vm_getbyvmid(uint32_t vmid)
1175{
1176	struct vmd_vm	*vm;
1177
1178	if (vmid == 0)
1179		return (NULL);
1180	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1181		if (vm->vm_vmid == vmid)
1182			return (vm);
1183	}
1184
1185	return (NULL);
1186}
1187
1188struct vmd_vm *
1189vm_getbyid(uint32_t id)
1190{
1191	struct vmd_vm	*vm;
1192
1193	if (id == 0)
1194		return (NULL);
1195	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1196		if (vm->vm_params.vmc_params.vcp_id == id)
1197			return (vm);
1198	}
1199
1200	return (NULL);
1201}
1202
1203uint32_t
1204vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1205{
1206	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1207		return (0);
1208	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1209	    id, vm->vm_vmid);
1210	return (vm->vm_vmid);
1211}
1212
1213uint32_t
1214vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1215{
1216	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1217		return (0);
1218	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1219	    vmid, vm->vm_params.vmc_params.vcp_id);
1220	return (vm->vm_params.vmc_params.vcp_id);
1221}
1222
1223struct vmd_vm *
1224vm_getbyname(const char *name)
1225{
1226	struct vmd_vm	*vm;
1227
1228	if (name == NULL)
1229		return (NULL);
1230	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1231		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1232			return (vm);
1233	}
1234
1235	return (NULL);
1236}
1237
1238struct vmd_vm *
1239vm_getbypid(pid_t pid)
1240{
1241	struct vmd_vm	*vm;
1242
1243	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1244		if (vm->vm_pid == pid)
1245			return (vm);
1246	}
1247
1248	return (NULL);
1249}
1250
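/*
 * vm_stop
 *
 * Closes a VM's open file descriptors (disks, interfaces, kernel, cdrom)
 * and clears its running state.  With keeptty set the console tty is kept
 * open, e.g. when a VM is stopped and relaunched in place.
 */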
1251void
1252vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
1253{
1254	struct privsep	*ps = &env->vmd_ps;
1255	unsigned int	 i, j;
1256
1257	if (vm == NULL)
1258		return;
1259
1260	log_debug("%s: %s %s stopping vm %d%s",
1261	    __func__, ps->ps_title[privsep_process], caller,
1262	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");
1263
1264	vm->vm_state &= ~(VM_STATE_RECEIVED | VM_STATE_RUNNING
1265	    | VM_STATE_SHUTDOWN);
1266
1267	if (vm->vm_iev.ibuf.fd != -1) {
1268		event_del(&vm->vm_iev.ev);
1269		close(vm->vm_iev.ibuf.fd);
1270	}
1271	for (i = 0; i < VM_MAX_DISKS_PER_VM; i++) {
1272		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
1273			if (vm->vm_disks[i][j] != -1) {
1274				close(vm->vm_disks[i][j]);
1275				vm->vm_disks[i][j] = -1;
1276			}
1277		}
1278	}
1279	for (i = 0; i < VM_MAX_NICS_PER_VM; i++) {
1280		if (vm->vm_ifs[i].vif_fd != -1) {
1281			close(vm->vm_ifs[i].vif_fd);
1282			vm->vm_ifs[i].vif_fd = -1;
1283		}
1284		free(vm->vm_ifs[i].vif_name);
1285		free(vm->vm_ifs[i].vif_switch);
1286		free(vm->vm_ifs[i].vif_group);
1287		vm->vm_ifs[i].vif_name = NULL;
1288		vm->vm_ifs[i].vif_switch = NULL;
1289		vm->vm_ifs[i].vif_group = NULL;
1290	}
1291	if (vm->vm_kernel != -1) {
1292		close(vm->vm_kernel);
1293		vm->vm_kernel = -1;
1294	}
1295	if (vm->vm_cdrom != -1) {
1296		close(vm->vm_cdrom);
1297		vm->vm_cdrom = -1;
1298	}
1299	if (!keeptty) {
1300		vm_closetty(vm);
1301		vm->vm_uid = 0;
1302	}
1303}
1304
1305void
1306vm_remove(struct vmd_vm *vm, const char *caller)
1307{
1308	struct privsep	*ps = &env->vmd_ps;
1309
1310	if (vm == NULL)
1311		return;
1312
1313	log_debug("%s: %s %s removing vm %d from running config",
1314	    __func__, ps->ps_title[privsep_process], caller,
1315	    vm->vm_vmid);
1316
1317	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);
1318
1319	vm_stop(vm, 0, caller);
1320	if (vm->vm_kernel_path != NULL && !vm->vm_from_config)
1321		free(vm->vm_kernel_path);
1322	free(vm);
1323}
1324
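/*
 * vm_claimid
 *
 * Maps a VM name and owner uid to a stable vmd-internal id, claiming a
 * new id on first use; the mapping is remembered so a VM keeps its id
 * for the lifetime of the daemon.
 */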
1325int
1326vm_claimid(const char *name, int uid, uint32_t *id)
1327{
1328	struct name2id *n2i = NULL;
1329
1330	TAILQ_FOREACH(n2i, env->vmd_known, entry)
1331		if (strcmp(n2i->name, name) == 0 && n2i->uid == uid)
1332			goto out;
1333
1334	if (++env->vmd_nvm == 0) {
1335		log_warnx("too many vms");
1336		return (-1);
1337	}
1338	if ((n2i = calloc(1, sizeof(struct name2id))) == NULL) {
1339		log_warnx("could not alloc vm name");
1340		return (-1);
1341	}
1342	n2i->id = env->vmd_nvm;
1343	n2i->uid = uid;
1344	if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name)) {
1345		log_warnx("vm name too long");
1346		free(n2i);
1347		return (-1);
1348	}
1349	TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry);
1350
1351out:
1352	*id = n2i->id;
1353	return (0);
1354}
1355
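/*
 * vm_register
 *
 * Registers a VM in the list of known VMs: checks ownership and resource
 * limits, applies instance inheritance via vm_instance(), fills in
 * defaults (one vcpu, the default memory size, randomized lladdrs) and
 * claims a vmd-internal id.  On failure -1 is returned with errno set;
 * EALREADY additionally hands back the existing VM in *ret_vm.
 */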
1356int
1357vm_register(struct privsep *ps, struct vmop_create_params *vmc,
1358    struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
1359{
1360	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
1361	struct vm_create_params	*vcp = &vmc->vmc_params;
1362	struct vmop_owner	*vmo = NULL;
1363	uint32_t		 nid, rng;
1364	unsigned int		 i, j;
1365	struct vmd_switch	*sw;
1366	char			*s;
1367	int			 ret = 0;
1368
1369	/* Check if this is an instance of another VM */
1370	if ((ret = vm_instance(ps, &vm_parent, vmc, uid)) != 0) {
1371		errno = ret; /* XXX might set invalid errno */
1372		return (-1);
1373	}
1374
1375	errno = 0;
1376	*ret_vm = NULL;
1377
1378	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1379	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1380		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
1381		    uid) != 0) {
1382			errno = EPERM;
1383			goto fail;
1384		}
1385		vm->vm_kernel = vmc->vmc_kernel;
1386		*ret_vm = vm;
1387		errno = EALREADY;
1388		goto fail;
1389	}
1390
1391	if (vm_parent != NULL)
1392		vmo = &vm_parent->vm_params.vmc_insowner;
1393
1394	/* non-root users can only start existing VMs or instances */
1395	if (vm_checkperm(NULL, vmo, uid) != 0) {
1396		log_warnx("permission denied");
1397		errno = EPERM;
1398		goto fail;
1399	}
1400	if (vmc->vmc_flags == 0) {
1401		log_warnx("invalid configuration, no devices");
1402		errno = VMD_DISK_MISSING;
1403		goto fail;
1404	}
1405	if (vcp->vcp_ncpus == 0)
1406		vcp->vcp_ncpus = 1;
1407	if (vcp->vcp_memranges[0].vmr_size == 0)
1408		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
1409	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
1410		log_warnx("invalid number of CPUs");
1411		goto fail;
1412	} else if (vmc->vmc_ndisks > VM_MAX_DISKS_PER_VM) {
1413		log_warnx("invalid number of disks");
1414		goto fail;
1415	} else if (vmc->vmc_nnics > VM_MAX_NICS_PER_VM) {
1416		log_warnx("invalid number of interfaces");
1417		goto fail;
1418	} else if (vmc->vmc_kernel == -1 && vmc->vmc_ndisks == 0
1419	    && strlen(vmc->vmc_cdrom) == 0) {
1420		log_warnx("no kernel or disk/cdrom specified");
1421		goto fail;
1422	} else if (strlen(vcp->vcp_name) == 0) {
1423		log_warnx("invalid VM name");
1424		goto fail;
1425	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
1426	    *vcp->vcp_name == '_') {
1427		log_warnx("invalid VM name");
1428		goto fail;
1429	} else {
1430		for (s = vcp->vcp_name; *s != '\0'; ++s) {
1431			if (!(isalnum((unsigned char)*s) || *s == '.' ||
1432			    *s == '-' || *s == '_')) {
1433				log_warnx("invalid VM name");
1434				goto fail;
1435			}
1436		}
1437	}
1438
1439	if ((vm = calloc(1, sizeof(*vm))) == NULL)
1440		goto fail;
1441
1442	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
1443	vmc = &vm->vm_params;
1444	vcp = &vmc->vmc_params;
1445	vm->vm_pid = -1;
1446	vm->vm_tty = -1;
1447	vm->vm_receive_fd = -1;
1448	vm->vm_kernel = -1;
1449	vm->vm_state &= ~VM_STATE_PAUSED;
1450
1451	if (vmc->vmc_kernel > -1)
1452		vm->vm_kernel = vmc->vmc_kernel;
1453
1454	for (i = 0; i < VM_MAX_DISKS_PER_VM; i++)
1455		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
1456			vm->vm_disks[i][j] = -1;
1457	for (i = 0; i < VM_MAX_NICS_PER_VM; i++)
1458		vm->vm_ifs[i].vif_fd = -1;
1459	for (i = 0; i < vmc->vmc_nnics; i++) {
1460		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
1461			/* inherit per-interface flags from the switch */
1462			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
1463		}
1464
1465		/*
1466		 * If the MAC address is zero, always randomize it in vmd(8)
1467		 * because we cannot rely on the guest OS to do the right
1468		 * thing like OpenBSD does.  Based on ether_fakeaddr()
1469		 * from the kernel, incremented by one to differentiate
1470		 * the source.
1471		 */
1472		if (memcmp(zero_mac, &vmc->vmc_macs[i], ETHER_ADDR_LEN) == 0) {
1473			rng = arc4random();
1474			vmc->vmc_macs[i][0] = 0xfe;
1475			vmc->vmc_macs[i][1] = 0xe1;
1476			vmc->vmc_macs[i][2] = 0xba + 1;
1477			vmc->vmc_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
1478			vmc->vmc_macs[i][4] = rng;
1479			vmc->vmc_macs[i][5] = rng >> 8;
1480		}
1481	}
1482	vm->vm_cdrom = -1;
1483	vm->vm_iev.ibuf.fd = -1;
1484
1485	/*
1486	 * Assign a new internal Id if not specified and we succeed in
1487	 * claiming a new Id.
1488	 */
1489	if (id != 0)
1490		vm->vm_vmid = id;
1491	else if (vm_claimid(vcp->vcp_name, uid, &nid) == -1)
1492		goto fail;
1493	else
1494		vm->vm_vmid = nid;
1495
1496	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
1497	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);
1498
1499	*ret_vm = vm;
1500	return (0);
1501 fail:
1502	if (errno == 0)
1503		errno = EINVAL;
1504	return (-1);
1505}
1506
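/*
 * vm_instance
 *
 * Validates an instance creation request against the named parent VM:
 * unspecified parameters are inherited from the parent, while overriding
 * cpu, memory, disk, interface, kernel or cdrom settings requires the
 * matching per-option instance permission.  Returns 0 on success or an
 * errno value.
 */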
1507int
1508vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1509    struct vmop_create_params *vmc, uid_t uid)
1510{
1511	char			*name;
1512	struct vm_create_params	*vcp = &vmc->vmc_params;
1513	struct vmop_create_params *vmcp;
1514	struct vm_create_params	*vcpp;
1515	unsigned int		 i, j;
1516
1517	/* return without error if the parent is NULL (nothing to inherit) */
1518	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1519	    vmc->vmc_instance[0] == '\0')
1520		return (0);
1521
1522	if ((*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) {
1523		return (VMD_PARENT_INVALID);
1524	}
1525
1526	vmcp = &(*vm_parent)->vm_params;
1527	vcpp = &vmcp->vmc_params;
1528
1529	/* Are we allowed to create an instance from this VM? */
1530	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1531		log_warnx("vm \"%s\" no permission to create vm instance",
1532		    vcpp->vcp_name);
1533		return (EPERM);
1534	}
1535
1536	name = vcp->vcp_name;
1537
1538	if (vm_getbyname(vcp->vcp_name) != NULL ||
1539	    vm_getbyvmid(vcp->vcp_id) != NULL) {
1540		return (EALREADY);
1541	}
1542
1543	/* CPU */
1544	if (vcp->vcp_ncpus == 0)
1545		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1546	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1547	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1548		log_warnx("vm \"%s\" no permission to set cpus", name);
1549		return (EPERM);
1550	}
1551
1552	/* memory */
1553	if (vcp->vcp_memranges[0].vmr_size == 0)
1554		vcp->vcp_memranges[0].vmr_size =
1555		    vcpp->vcp_memranges[0].vmr_size;
1556	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1557	    vcp->vcp_memranges[0].vmr_size !=
1558	    vcpp->vcp_memranges[0].vmr_size) {
1559		log_warnx("vm \"%s\" no permission to set memory", name);
1560		return (EPERM);
1561	}
1562
1563	/* disks cannot be inherited */
1564	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1565	    vmc->vmc_ndisks) {
1566		log_warnx("vm \"%s\" no permission to set disks", name);
1567		return (EPERM);
1568	}
1569	for (i = 0; i < vmc->vmc_ndisks; i++) {
1570		/* Check if this disk is already used in the parent */
1571		for (j = 0; j < vmcp->vmc_ndisks; j++) {
1572			if (strcmp(vmc->vmc_disks[i],
1573			    vmcp->vmc_disks[j]) == 0) {
1574				log_warnx("vm \"%s\" disk %s cannot be reused",
1575				    name, vmc->vmc_disks[i]);
1576				return (EBUSY);
1577			}
1578		}
1579		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1580	}
1581
1582	/* interfaces */
1583	if (vmc->vmc_nnics > 0 &&
1584	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1585	    vmc->vmc_nnics != vmcp->vmc_nnics) {
1586		log_warnx("vm \"%s\" no permission to set interfaces", name);
1587		return (EPERM);
1588	}
1589	for (i = 0; i < vmcp->vmc_nnics; i++) {
1590		/* Interface got overwritten */
1591		if (i < vmc->vmc_nnics)
1592			continue;
1593
1594		/* Copy interface from parent */
1595		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1596		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1597		    sizeof(vmc->vmc_ifnames[i]));
1598		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1599		    sizeof(vmc->vmc_ifswitch[i]));
1600		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1601		    sizeof(vmc->vmc_ifgroup[i]));
1602		memcpy(vmc->vmc_macs[i], vmcp->vmc_macs[i],
1603		    sizeof(vmc->vmc_macs[i]));
1604		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1605		vmc->vmc_nnics++;
1606	}
1607	for (i = 0; i < vmc->vmc_nnics; i++) {
1608		for (j = 0; j < vmcp->vmc_nnics; j++) {
1609			if (memcmp(zero_mac, vmc->vmc_macs[i],
1610			    sizeof(vmc->vmc_macs[i])) != 0 &&
1611			    memcmp(vmcp->vmc_macs[i], vmc->vmc_macs[i],
1612			    sizeof(vmc->vmc_macs[i])) != 0) {
1613				log_warnx("vm \"%s\" lladdr cannot be reused",
1614				    name);
1615				return (EBUSY);
1616			}
1617			if (strlen(vmc->vmc_ifnames[i]) &&
1618			    strcmp(vmc->vmc_ifnames[i],
1619			    vmcp->vmc_ifnames[j]) == 0) {
1620				log_warnx("vm \"%s\" %s cannot be reused",
1621				    name, vmc->vmc_ifnames[i]);
1622				return (EBUSY);
1623			}
1624		}
1625	}
1626
1627	/* kernel */
1628	if (vmc->vmc_kernel > -1 || ((*vm_parent)->vm_kernel_path != NULL &&
1629		strnlen((*vm_parent)->vm_kernel_path, PATH_MAX) < PATH_MAX)) {
1630		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1631			log_warnx("vm \"%s\" no permission to set boot image",
1632			    name);
1633			return (EPERM);
1634		}
1635		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1636	}
1637
1638	/* cdrom */
1639	if (strlen(vmc->vmc_cdrom) > 0) {
1640		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1641			log_warnx("vm \"%s\" no permission to set cdrom", name);
1642			return (EPERM);
1643		}
1644		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1645	} else if (strlcpy(vmc->vmc_cdrom, vmcp->vmc_cdrom,
1646	    sizeof(vmc->vmc_cdrom)) >= sizeof(vmc->vmc_cdrom)) {
1647		log_warnx("vm \"%s\" cdrom name too long", name);
1648		return (EINVAL);
1649	}
1650
1651	/* user */
1652	if (vmc->vmc_owner.uid == 0)
1653		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1654	else if (vmc->vmc_owner.uid != uid &&
1655	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1656		log_warnx("vm \"%s\" user mismatch", name);
1657		return (EPERM);
1658	}
1659
1660	/* group */
1661	if (vmc->vmc_owner.gid == 0)
1662		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1663	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1664		log_warnx("vm \"%s\" group mismatch", name);
1665		return (EPERM);
1666	}
1667
1668	/* child instances */
1669	if (vmc->vmc_insflags) {
1670		log_warnx("vm \"%s\" cannot change instance permissions", name);
1671		return (EPERM);
1672	}
1673	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1674		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
1675		vmc->vmc_insowner.uid = vmcp->vmc_insowner.uid;
1676		vmc->vmc_insflags = vmcp->vmc_insflags;
1677	} else {
1678		vmc->vmc_insowner.gid = 0;
1679		vmc->vmc_insowner.uid = 0;
1680		vmc->vmc_insflags = 0;
1681	}
1682
1683	/* finished, remove instance flags */
1684	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1685
1686	return (0);
1687}
1688
1689/*
1690 * vm_checkperm
1691 *
1692 * Checks if the user represented by the 'uid' parameter is allowed to
1693 * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1694 * console.)
1695 *
1696 * Parameters:
1697 *  vm: the VM whose permission is to be checked
1698 *  vmo: the required uid/gid to be checked
1699 *  uid: the user ID of the user making the request
1700 *
1701 * Return values:
1702 *   0: the permission should be granted
1703 *  -1: the permission check failed (also returned if vm == null)
1704 */
1705int
1706vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1707{
1708	struct group	*gr;
1709	struct passwd	*pw;
1710	char		**grmem;
1711
1712	/* root has no restrictions */
1713	if (uid == 0)
1714		return (0);
1715
1716	if (vmo == NULL)
1717		return (-1);
1718
1719	/* check user */
1720	if (vm == NULL) {
1721		if  (vmo->uid == uid)
1722			return (0);
1723	} else {
1724		/*
1725		 * check user of running vm (the owner of a running vm can
1726		 * be different from (or more specific than) the configured owner).
1727		 */
1728		if (((vm->vm_state & VM_STATE_RUNNING) && vm->vm_uid == uid) ||
1729		    (!(vm->vm_state & VM_STATE_RUNNING) && vmo->uid == uid))
1730			return (0);
1731	}
1732
1733	/* check groups */
1734	if (vmo->gid != -1) {
1735		if ((pw = getpwuid(uid)) == NULL)
1736			return (-1);
1737		if (pw->pw_gid == vmo->gid)
1738			return (0);
1739		if ((gr = getgrgid(vmo->gid)) != NULL) {
1740			for (grmem = gr->gr_mem; *grmem; grmem++)
1741				if (strcmp(*grmem, pw->pw_name) == 0)
1742					return (0);
1743		}
1744	}
1745
1746	return (-1);
1747}
1748
1749/*
1750 * vm_checkinsflag
1751 *
1752 * Checks whether the non-root user is allowed to set an instance option.
1753 *
1754 * Parameters:
1755 *  vmc: the VM create parameters
1756 *  flag: the flag to be checked
1757 *  uid: the user ID of the user making the request
1758 *
1759 * Return values:
1760 *   0: the permission should be granted
1761 *  -1: the permission check failed
1762 */
1763int
1764vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1765{
1766	/* root has no restrictions */
1767	if (uid == 0)
1768		return (0);
1769
1770	if ((vmc->vmc_insflags & flag) == 0)
1771		return (-1);
1772
1773	return (0);
1774}
1775
1776/*
1777 * vm_checkaccess
1778 *
1779 * Checks if the user represented by the 'uid' parameter is allowed to
1780 * access the file described by the 'path' parameter.
1781 *
1782 * Parameters:
1783 *  fd: the file descriptor of the opened file
1784 *  uflag: check if the userid has access to the file
1785 *  uid: the user ID of the user making the request
1786 *  amode: the access flags of R_OK and W_OK
1787 *
1788 * Return values:
1789 *   0: the permission should be granted
1790 *  -1: the permission check failed
1791 */
1792int
1793vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
1794{
1795	struct group	*gr;
1796	struct passwd	*pw;
1797	char		**grmem;
1798	struct stat	 st;
1799	mode_t		 mode;
1800
1801	if (fd == -1)
1802		return (-1);
1803
1804	/*
1805	 * File has to be accessible and a regular file
1806	 */
1807	if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode))
1808		return (-1);
1809
1810	/* root has no restrictions */
1811	if (uid == 0 || uflag == 0)
1812		return (0);
1813
1814	/* check other */
1815	mode = amode & W_OK ? S_IWOTH : 0;
1816	mode |= amode & R_OK ? S_IROTH : 0;
1817	if ((st.st_mode & mode) == mode)
1818		return (0);
1819
1820	/* check user */
1821	mode = amode & W_OK ? S_IWUSR : 0;
1822	mode |= amode & R_OK ? S_IRUSR : 0;
1823	if (uid == st.st_uid && (st.st_mode & mode) == mode)
1824		return (0);
1825
1826	/* check groups */
1827	mode = amode & W_OK ? S_IWGRP : 0;
1828	mode |= amode & R_OK ? S_IRGRP : 0;
1829	if ((st.st_mode & mode) != mode)
1830		return (-1);
1831	if ((pw = getpwuid(uid)) == NULL)
1832		return (-1);
1833	if (pw->pw_gid == st.st_gid)
1834		return (0);
1835	if ((gr = getgrgid(st.st_gid)) != NULL) {
1836		for (grmem = gr->gr_mem; *grmem; grmem++)
1837			if (strcmp(*grmem, pw->pw_name) == 0)
1838				return (0);
1839	}
1840
1841	return (-1);
1842}
1843
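/*
 * vm_opentty
 *
 * Allocates the VM's console pty from the pre-opened PTM fd, enables user
 * ioctl mode for passing break commands and adjusts tty ownership and
 * permissions, loosely following the behavior of ssh's sshpty.c.
 */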
1844int
1845vm_opentty(struct vmd_vm *vm)
1846{
1847	struct stat		 st;
1848	struct group		*gr;
1849	uid_t			 uid;
1850	gid_t			 gid;
1851	mode_t			 mode;
1852	int			 on = 1, tty_slave;
1853
1854	/*
1855	 * Open tty with pre-opened PTM fd
1856	 */
1857	if (fdopenpty(env->vmd_ptmfd, &vm->vm_tty, &tty_slave, vm->vm_ttyname,
1858	    NULL, NULL) == -1) {
1859		log_warn("fdopenpty");
1860		return (-1);
1861	}
1862	close(tty_slave);
1863
1864	/*
1865	 * We use user ioctl(2) mode to pass break commands.
1866	 */
1867	if (ioctl(vm->vm_tty, TIOCUCNTL, &on) == -1) {
1868		log_warn("could not enable user ioctl mode on %s",
1869		    vm->vm_ttyname);
1870		goto fail;
1871	}
1872
1873	uid = vm->vm_uid;
1874	gid = vm->vm_params.vmc_owner.gid;
1875
1876	if (vm->vm_params.vmc_owner.gid != -1) {
1877		mode = 0660;
1878	} else if ((gr = getgrnam("tty")) != NULL) {
1879		gid = gr->gr_gid;
1880		mode = 0620;
1881	} else {
1882		mode = 0600;
1883		gid = 0;
1884	}
1885
1886	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
1887	    __func__, vm->vm_params.vmc_params.vcp_name,
1888	    vm->vm_ttyname, uid, gid, mode);
1889
1890	/*
1891	 * Change ownership and mode of the tty as required.
1892	 * Loosely based on the implementation of sshpty.c
1893	 */
1894	if (fstat(vm->vm_tty, &st) == -1) {
1895		log_warn("fstat failed for %s", vm->vm_ttyname);
1896		goto fail;
1897	}
1898
1899	if (st.st_uid != uid || st.st_gid != gid) {
1900		if (chown(vm->vm_ttyname, uid, gid) == -1) {
1901			log_warn("chown %s %d %d failed, uid %d",
1902			    vm->vm_ttyname, uid, gid, getuid());
1903
1904			/* Ignore failure on read-only filesystems */
1905			if (!((errno == EROFS) &&
1906			    (st.st_uid == uid || st.st_uid == 0)))
1907				goto fail;
1908		}
1909	}
1910
1911	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
1912		if (chmod(vm->vm_ttyname, mode) == -1) {
1913			log_warn("chmod %s %o failed, uid %d",
1914			    vm->vm_ttyname, mode, getuid());
1915
1916			/* Ignore failure on read-only filesystems */
1917			if (!((errno == EROFS) &&
1918			    (st.st_uid == uid || st.st_uid == 0)))
1919				goto fail;
1920		}
1921	}
1922
1923	return (0);
1924 fail:
1925	vm_closetty(vm);
1926	return (-1);
1927}
1928
1929void
1930vm_closetty(struct vmd_vm *vm)
1931{
1932	if (vm->vm_tty != -1) {
1933		/* Release and close the tty */
1934		if (fchown(vm->vm_tty, 0, 0) == -1)
1935			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1936		if (fchmod(vm->vm_tty, 0666) == -1)
1937			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1938		close(vm->vm_tty);
1939		vm->vm_tty = -1;
1940	}
1941	memset(&vm->vm_ttyname, 0, sizeof(vm->vm_ttyname));
1942}
1943
1944void
1945switch_remove(struct vmd_switch *vsw)
1946{
1947	if (vsw == NULL)
1948		return;
1949
1950	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1951
1952	free(vsw->sw_group);
1953	free(vsw->sw_name);
1954	free(vsw);
1955}
1956
1957struct vmd_switch *
1958switch_getbyname(const char *name)
1959{
1960	struct vmd_switch	*vsw;
1961
1962	if (name == NULL)
1963		return (NULL);
1964	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1965		if (strcmp(vsw->sw_name, name) == 0)
1966			return (vsw);
1967	}
1968
1969	return (NULL);
1970}
1971
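/*
 * Returns a NUL-terminated copy of the leading printable characters of
 * the given (not necessarily terminated) buffer; the caller must free it.
 */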
1972char *
1973get_string(uint8_t *ptr, size_t len)
1974{
1975	size_t	 i;
1976
1977	for (i = 0; i < len; i++)
1978		if (!isprint((unsigned char)ptr[i]))
1979			break;
1980
1981	return strndup((const char *)ptr, i);
1982}
1983
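/*
 * Converts a prefix length to an IPv4 netmask in network byte order,
 * e.g. prefixlen2mask(24) yields htonl(0xffffff00), i.e. 255.255.255.0.
 */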
1984uint32_t
1985prefixlen2mask(uint8_t prefixlen)
1986{
1987	if (prefixlen == 0)
1988		return (0);
1989
1990	if (prefixlen > 32)
1991		prefixlen = 32;
1992
1993	return (htonl(0xffffffff << (32 - prefixlen)));
1994}
1995
1996void
1997prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
1998{
1999	struct in6_addr	 s6;
2000	int		 i;
2001
2002	if (prefixlen > 128)
2003		prefixlen = 128;
2004
2005	memset(&s6, 0, sizeof(s6));
2006	for (i = 0; i < prefixlen / 8; i++)
2007		s6.s6_addr[i] = 0xff;
2008	i = prefixlen % 8;
2009	if (i)
2010		s6.s6_addr[prefixlen / 8] = 0xff00 >> i;
2011
2012	memcpy(mask, &s6, sizeof(s6));
2013}
2014
2015void
2016getmonotime(struct timeval *tv)
2017{
2018	struct timespec	 ts;
2019
2020	if (clock_gettime(CLOCK_MONOTONIC, &ts))
2021		fatal("clock_gettime");
2022
2023	TIMESPEC_TO_TIMEVAL(tv, &ts);
2024}
2025
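/*
 * vm_terminate
 *
 * Fully terminates a VM: VMs that come from the config file are stopped
 * but kept in the running configuration, while ad hoc VMs are removed
 * entirely.
 */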
2026static inline void
2027vm_terminate(struct vmd_vm *vm, const char *caller)
2028{
2029	if (vm->vm_from_config)
2030		vm_stop(vm, 0, caller);
2031	else {
2032		/* vm_remove calls vm_stop */
2033		vm_remove(vm, caller);
2034	}
2035}
2036
2037/*
2038 * Utility function for closing vm file descriptors. Assumes an fd of -1 was
2039 * already closed or never opened.
2040 *
2041 * Returns 0 on success, otherwise -1 on failure.
2042 */
2043int
2044close_fd(int fd)
2045{
2046	int	ret;
2047
2048	if (fd == -1)
2049		return (0);
2050
2051#ifdef POSIX_CLOSE_RESTART
2052	do { ret = close(fd); } while (ret == -1 && errno == EINTR);
2053#else
2054	ret = close(fd);
2055#endif /* POSIX_CLOSE_RESTART */
2056
2057	if (ret == -1 && errno == EIO)
2058		log_warn("%s(%d)", __func__, fd);
2059
2060	return (ret);
2061}
2062