/* vmd.c revision 1.157 */
1/*	$OpenBSD: vmd.c,v 1.157 2024/05/18 06:45:00 jsg Exp $	*/
2
3/*
4 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/types.h>
20#include <sys/queue.h>
21#include <sys/wait.h>
22#include <sys/stat.h>
23#include <sys/sysctl.h>
24#include <sys/tty.h>
25#include <sys/ttycom.h>
26#include <sys/ioctl.h>
27
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31#include <termios.h>
32#include <errno.h>
33#include <event.h>
34#include <fcntl.h>
35#include <pwd.h>
36#include <signal.h>
37#include <syslog.h>
38#include <unistd.h>
39#include <util.h>
40#include <ctype.h>
41#include <grp.h>
42
43#include <machine/specialreg.h>
44#include <machine/vmmvar.h>
45
46#include "proc.h"
47#include "atomicio.h"
48#include "vmd.h"
49
50__dead void usage(void);
51
52int	 main(int, char **);
53int	 vmd_configure(void);
54void	 vmd_sighdlr(int sig, short event, void *arg);
55void	 vmd_shutdown(void);
56int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
57int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
58int	 vmd_dispatch_agentx(int, struct privsep_proc *, struct imsg *);
59int	 vmd_dispatch_priv(int, struct privsep_proc *, struct imsg *);
60int	 vmd_check_vmh(struct vm_dump_header *);
61
62int	 vm_instance(struct privsep *, struct vmd_vm **,
63	    struct vmop_create_params *, uid_t);
64int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
65int	 vm_claimid(const char *, int, uint32_t *);
66void	 start_vm_batch(int, short, void*);
67
68static inline void vm_terminate(struct vmd_vm *, const char *);
69
/* Global daemon state, shared by all parent-process code. */
struct vmd	*env;

/* Privsep child process table; proc_init() forks one child per entry. */
static struct privsep_proc procs[] = {
	/* Keep "priv" on top as procs[0] */
	{ "priv",	PROC_PRIV,	vmd_dispatch_priv, priv },
	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm,
	  vmm_shutdown, "/" },
	{ "agentx", 	PROC_AGENTX,	vmd_dispatch_agentx, vm_agentx,
	  vm_agentx_shutdown, "/" }
};

/* Identity of the currently running privsep process (parent or child). */
enum privsep_procid privsep_process;

/* Timer used to stagger VM startup in batches (see start_vm_batch()). */
struct event staggered_start_timer;

/* For the privileged process */
static struct privsep_proc *proc_priv = &procs[0];
static struct passwd proc_privpw;
static const uint8_t zero_mac[ETHER_ADDR_LEN];

/* Configuration file path; overridden by -f on the command line. */
const char		 default_conffile[] = VMD_CONF;
const char		*conffile = default_conffile;
93
/*
 * Handle imsgs relayed from the control process (i.e. vmctl(8) requests):
 * start/terminate/wait/pause/unpause, send/receive, config (re)load and
 * info queries.  The first switch performs the operation and records the
 * response type in 'cmd' and the error in 'res'; the second switch sends
 * the response back to the control process.
 *
 * Returns 0 on success, -1 to make the privsep layer drop the connection.
 */
int
vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct privsep			*ps = p->p_ps;
	int				 res = 0, ret = 0, cmd = 0, verbose;
	int				 ifd;
	unsigned int			 v = 0, flags;
	struct vmop_create_params	 vmc;
	struct vmop_id			 vid;
	struct vmop_result		 vmr;
	struct vm_dump_header		 vmh;
	struct vmd_vm			*vm = NULL;
	char				*str = NULL;
	uint32_t			 id = 0;
	struct control_sock		*rcs;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_START_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vmc);
		memcpy(&vmc, imsg->data, sizeof(vmc));
		/* Kernel image fd travels alongside the create params. */
		vmc.vmc_kernel = imsg_get_fd(imsg);

		/* Try registering our VM in our list of known VMs. */
		if (vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid)) {
			res = errno;

			/* Did we have a failure during lookup of a parent? */
			if (vm == NULL) {
				cmd = IMSG_VMDOP_START_VM_RESPONSE;
				break;
			}

			/* Does the VM already exist? */
			if (res == EALREADY) {
				/* Is it already running? */
				if (vm->vm_state & VM_STATE_RUNNING) {
					cmd = IMSG_VMDOP_START_VM_RESPONSE;
					break;
				}

				/* If not running, are our flags ok? */
				if (vmc.vmc_flags &&
				    vmc.vmc_flags != VMOP_CREATE_KERNEL) {
					cmd = IMSG_VMDOP_START_VM_RESPONSE;
					break;
				}
			}
			/* Known but stopped VM: restart is allowed. */
			res = 0;
		}

		/* Try to start the launch of the VM. */
		res = config_setvm(ps, vm, imsg->hdr.peerid,
		    vm->vm_params.vmc_owner.uid);
		if (res)
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		break;
	case IMSG_VMDOP_WAIT_VM_REQUEST:
	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		flags = vid.vid_flags;
		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;

		if ((id = vid.vid_id) == 0) {
			/* Lookup vm (id) by name */
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				break;
			}
			id = vm->vm_vmid;
		} else if ((vm = vm_getbyvmid(id)) == NULL) {
			res = ENOENT;
			break;
		}

		/* Validate current state of vm */
		if ((vm->vm_state & VM_STATE_SHUTDOWN) &&
			    (flags & VMOP_FORCE) == 0) {
				res = EALREADY;
				break;
		} else if (!(vm->vm_state & VM_STATE_RUNNING)) {
			res = EINVAL;
			break;
		} else if (vm_checkperm(vm, &vm->vm_params.vmc_owner, vid.vid_uid)) {
			res = EPERM;
			break;
		}

		/* Only relay TERMINATION requests, not WAIT requests */
		if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_REQUEST) {
			memset(&vid, 0, sizeof(vid));
			vid.vid_id = id;
			vid.vid_flags = flags;

			if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
				imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
				return (-1);
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
		/* The vmm process owns the running-VM info; relay as-is. */
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		break;
	case IMSG_VMDOP_LOAD:
		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
		str = get_string((uint8_t *)imsg->data,
		    IMSG_DATA_SIZE(imsg));
		/* FALLTHROUGH */
	case IMSG_VMDOP_RELOAD:
		/* str is NULL for RELOAD, which reloads the default file. */
		if (vmd_reload(0, str) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		free(str);
		break;
	case IMSG_CTL_RESET:
		IMSG_SIZE_CHECK(imsg, &v);
		memcpy(&v, imsg->data, sizeof(v));
		if (vmd_reload(v, NULL) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		break;
	case IMSG_CTL_VERBOSE:
		IMSG_SIZE_CHECK(imsg, &verbose);
		memcpy(&verbose, imsg->data, sizeof(verbose));
		log_setverbose(verbose);

		/* Propagate the new log level to the other processes. */
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
		cmd = IMSG_CTL_OK;
		break;
	case IMSG_VMDOP_PAUSE_VM:
	case IMSG_VMDOP_UNPAUSE_VM:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		if (vid.vid_id == 0) {
			/* Lookup by name when no numeric id was given. */
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
				    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
				    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
			break;
		}
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    vid.vid_uid) != 0) {
			res = EPERM;
			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
			break;
		}
		/* Relay to vmm; the response comes back asynchronously. */
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_SEND_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		/* fd to write the VM image to; closed here on error paths. */
		ifd = imsg_get_fd(imsg);
		if (vid.vid_id == 0) {
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
				close(ifd);
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
			close(ifd);
			break;
		}
		vmr.vmr_id = vid.vid_id;
		log_debug("%s: sending fd to vmm", __func__);
		/* Ownership of ifd passes to the vmm process here. */
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, ifd, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		/* fd to read the serialized VM image from. */
		ifd = imsg_get_fd(imsg);
		if (ifd == -1) {
			log_warnx("%s: invalid fd", __func__);
			return (-1);
		}
		/* Read and validate the dump header before anything else. */
		if (atomicio(read, ifd, &vmh, sizeof(vmh)) != sizeof(vmh)) {
			log_warnx("%s: error reading vmh from received vm",
			    __func__);
			res = EIO;
			close(ifd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}

		if (vmd_check_vmh(&vmh)) {
			res = ENOENT;
			close(ifd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		if (atomicio(read, ifd, &vmc, sizeof(vmc)) != sizeof(vmc)) {
			log_warnx("%s: error reading vmc from received vm",
			    __func__);
			res = EIO;
			close(ifd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		/* Rename the received VM as requested and clear its old id. */
		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
		    sizeof(vmc.vmc_params.vcp_name));
		vmc.vmc_params.vcp_id = 0;

		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
		if (ret != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			close(ifd);
		} else {
			vm->vm_state |= VM_STATE_RECEIVED;
			/*
			 * NOTE(review): config_setvm() return value is
			 * ignored on this path — confirm intentional.
			 */
			config_setvm(ps, vm, imsg->hdr.peerid,
			    vmc.vmc_owner.uid);
			log_debug("%s: sending fd to vmm", __func__);
			/* Ownership of ifd passes to the vmm process. */
			proc_compose_imsg(ps, PROC_VMM, -1,
			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, ifd,
			    NULL, 0);
		}
		break;
	case IMSG_VMDOP_DONE:
		/* Config is fully loaded; open the control sockets. */
		control_reset(&ps->ps_csock);
		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
			control_reset(rcs);
		cmd = 0;
		break;
	default:
		return (-1);
	}

	/* Send the response (if any) back to the control process. */
	switch (cmd) {
	case 0:
		/* No response required. */
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		memset(&vmr, 0, sizeof(vmr));
		vmr.vmr_result = res;
		vmr.vmr_id = id;
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	default:
		/* Generic responses carry only the errno-style result. */
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
			return (-1);
		break;
	}

	return (0);
}
362
363int
364vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
365{
366	struct vmop_result	 vmr;
367	struct privsep		*ps = p->p_ps;
368	int			 res = 0;
369	struct vmd_vm		*vm;
370	struct vm_create_params	*vcp;
371	struct vmop_info_result	 vir;
372
373	switch (imsg->hdr.type) {
374	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
375		IMSG_SIZE_CHECK(imsg, &vmr);
376		memcpy(&vmr, imsg->data, sizeof(vmr));
377		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
378			break;
379		proc_compose_imsg(ps, PROC_CONTROL, -1,
380		    imsg->hdr.type, imsg->hdr.peerid, -1,
381		    imsg->data, sizeof(imsg->data));
382		log_info("%s: paused vm %d successfully",
383		    vm->vm_params.vmc_params.vcp_name,
384		    vm->vm_vmid);
385		vm->vm_state |= VM_STATE_PAUSED;
386		break;
387	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
388		IMSG_SIZE_CHECK(imsg, &vmr);
389		memcpy(&vmr, imsg->data, sizeof(vmr));
390		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
391			break;
392		proc_compose_imsg(ps, PROC_CONTROL, -1,
393		    imsg->hdr.type, imsg->hdr.peerid, -1,
394		    imsg->data, sizeof(imsg->data));
395		log_info("%s: unpaused vm %d successfully.",
396		    vm->vm_params.vmc_params.vcp_name,
397		    vm->vm_vmid);
398		vm->vm_state &= ~VM_STATE_PAUSED;
399		break;
400	case IMSG_VMDOP_START_VM_RESPONSE:
401		IMSG_SIZE_CHECK(imsg, &vmr);
402		memcpy(&vmr, imsg->data, sizeof(vmr));
403		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
404			break;
405		vm->vm_pid = vmr.vmr_pid;
406		vcp = &vm->vm_params.vmc_params;
407		vcp->vcp_id = vmr.vmr_id;
408
409		/*
410		 * If the peerid is not -1, forward the response back to the
411		 * the control socket.  If it is -1, the request originated
412		 * from the parent, not the control socket.
413		 */
414		if (vm->vm_peerid != (uint32_t)-1) {
415			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
416			    sizeof(vmr.vmr_ttyname));
417			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
418			    imsg->hdr.type, vm->vm_peerid, -1,
419			    &vmr, sizeof(vmr)) == -1) {
420				errno = vmr.vmr_result;
421				log_warn("%s: failed to forward vm result",
422				    vcp->vcp_name);
423				vm_terminate(vm, __func__);
424				return (-1);
425			}
426		}
427
428		if (vmr.vmr_result) {
429			log_warnx("%s: failed to start vm", vcp->vcp_name);
430			vm_terminate(vm, __func__);
431			errno = vmr.vmr_result;
432			break;
433		}
434
435		/* Now configure all the interfaces */
436		if (vm_priv_ifconfig(ps, vm) == -1) {
437			log_warn("%s: failed to configure vm", vcp->vcp_name);
438			vm_terminate(vm, __func__);
439			break;
440		}
441
442		log_info("started %s (vm %d) successfully, tty %s",
443		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
444		break;
445	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
446		IMSG_SIZE_CHECK(imsg, &vmr);
447		memcpy(&vmr, imsg->data, sizeof(vmr));
448
449		if (vmr.vmr_result) {
450			DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
451			    __func__, vmr.vmr_id);
452			proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
453		} else {
454			if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
455				break;
456			/* Mark VM as shutting down */
457			vm->vm_state |= VM_STATE_SHUTDOWN;
458		}
459		break;
460	case IMSG_VMDOP_SEND_VM_RESPONSE:
461		IMSG_SIZE_CHECK(imsg, &vmr);
462		memcpy(&vmr, imsg->data, sizeof(vmr));
463		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
464			break;
465		if (!vmr.vmr_result) {
466			log_info("%s: sent vm %d successfully.",
467			    vm->vm_params.vmc_params.vcp_name,
468			    vm->vm_vmid);
469			vm_terminate(vm, __func__);
470		}
471
472		/* Send a response if a control client is waiting for it */
473		if (imsg->hdr.peerid != (uint32_t)-1) {
474			/* the error is meaningless for deferred responses */
475			vmr.vmr_result = 0;
476
477			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
478			    IMSG_VMDOP_SEND_VM_RESPONSE,
479			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
480				return (-1);
481		}
482		break;
483	case IMSG_VMDOP_TERMINATE_VM_EVENT:
484		IMSG_SIZE_CHECK(imsg, &vmr);
485		memcpy(&vmr, imsg->data, sizeof(vmr));
486		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
487		    __func__, vmr.vmr_id, vmr.vmr_result);
488		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
489			log_debug("%s: vm %d is no longer available",
490			    __func__, vmr.vmr_id);
491			break;
492		}
493		if (vmr.vmr_result != EAGAIN ||
494		    vm->vm_params.vmc_bootdevice) {
495			vm_terminate(vm, __func__);
496		} else {
497			/* Stop VM instance but keep the tty open */
498			vm_stop(vm, 1, __func__);
499			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
500		}
501
502		/* The error is meaningless for deferred responses */
503		vmr.vmr_result = 0;
504
505		if (proc_compose_imsg(ps, PROC_CONTROL, -1,
506			IMSG_VMDOP_TERMINATE_VM_EVENT,
507			imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
508			return (-1);
509		break;
510	case IMSG_VMDOP_GET_INFO_VM_DATA:
511		IMSG_SIZE_CHECK(imsg, &vir);
512		memcpy(&vir, imsg->data, sizeof(vir));
513		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
514			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
515			if (vm->vm_ttyname[0] != '\0')
516				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
517				    sizeof(vir.vir_ttyname));
518			log_debug("%s: running vm: %d, vm_state: 0x%x",
519			    __func__, vm->vm_vmid, vm->vm_state);
520			vir.vir_state = vm->vm_state;
521			/* get the user id who started the vm */
522			vir.vir_uid = vm->vm_uid;
523			vir.vir_gid = vm->vm_params.vmc_owner.gid;
524		}
525		if (proc_compose_imsg(ps,
526		    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
527		    PROC_AGENTX : PROC_CONTROL, -1, imsg->hdr.type,
528		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
529			if (vm)
530				vm_terminate(vm, __func__);
531			return (-1);
532		}
533		break;
534	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
535		/*
536		 * PROC_VMM has responded with the *running* VMs, now we
537		 * append the others. These use the special value 0 for their
538		 * kernel id to indicate that they are not running.
539		 */
540		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
541			if (!(vm->vm_state & VM_STATE_RUNNING)) {
542				memset(&vir, 0, sizeof(vir));
543				vir.vir_info.vir_id = vm->vm_vmid;
544				strlcpy(vir.vir_info.vir_name,
545				    vm->vm_params.vmc_params.vcp_name,
546				    VMM_MAX_NAME_LEN);
547				vir.vir_info.vir_memory_size =
548				    vm->vm_params.vmc_params.
549				    vcp_memranges[0].vmr_size;
550				vir.vir_info.vir_ncpus =
551				    vm->vm_params.vmc_params.vcp_ncpus;
552				/* get the configured user id for this vm */
553				vir.vir_uid = vm->vm_params.vmc_owner.uid;
554				vir.vir_gid = vm->vm_params.vmc_owner.gid;
555				log_debug("%s: vm: %d, vm_state: 0x%x",
556				    __func__, vm->vm_vmid, vm->vm_state);
557				vir.vir_state = vm->vm_state;
558				if (proc_compose_imsg(ps,
559				    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
560				    PROC_AGENTX : PROC_CONTROL, -1,
561				    IMSG_VMDOP_GET_INFO_VM_DATA,
562				    imsg->hdr.peerid, -1, &vir,
563				    sizeof(vir)) == -1) {
564					log_debug("%s: GET_INFO_VM_END failed",
565					    __func__);
566					vm_terminate(vm, __func__);
567					return (-1);
568				}
569			}
570		}
571		IMSG_SIZE_CHECK(imsg, &res);
572		proc_forward_imsg(ps, imsg,
573		    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
574		    PROC_AGENTX : PROC_CONTROL, -1);
575		break;
576	default:
577		return (-1);
578	}
579
580	return (0);
581}
582
583int
584vmd_dispatch_agentx(int fd, struct privsep_proc *p, struct imsg *imsg)
585{
586	struct privsep			*ps = p->p_ps;
587
588	switch (imsg->hdr.type) {
589	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
590		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
591		return (0);
592	default:
593		break;
594	}
595	return (-1);
596}
597
598int
599vmd_dispatch_priv(int fd, struct privsep_proc *p, struct imsg *imsg)
600{
601	struct vmop_addr_result	 var;
602
603	switch (imsg->hdr.type) {
604	case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
605		IMSG_SIZE_CHECK(imsg, &var);
606		memcpy(&var, imsg->data, sizeof(var));
607		proc_forward_imsg(p->p_ps, imsg, PROC_VMM, -1);
608		break;
609	default:
610		return (-1);
611	}
612
613	return (0);
614}
615
616int
617vmd_check_vmh(struct vm_dump_header *vmh)
618{
619	int i;
620	unsigned int code, leaf;
621	unsigned int a, b, c, d;
622
623	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) {
624		log_warnx("%s: incompatible dump signature", __func__);
625		return (-1);
626	}
627
628	if (vmh->vmh_version != VM_DUMP_VERSION) {
629		log_warnx("%s: incompatible dump version", __func__);
630		return (-1);
631	}
632
633	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
634		code = vmh->vmh_cpuids[i].code;
635		leaf = vmh->vmh_cpuids[i].leaf;
636		if (leaf != 0x00) {
637			log_debug("%s: invalid leaf 0x%x for code 0x%x",
638			    __func__, leaf, code);
639			return (-1);
640		}
641
642		switch (code) {
643		case 0x00:
644			CPUID_LEAF(code, leaf, a, b, c, d);
645			if (vmh->vmh_cpuids[i].a > a) {
646				log_debug("%s: incompatible cpuid level",
647				    __func__);
648				return (-1);
649			}
650			if (!(vmh->vmh_cpuids[i].b == b &&
651			    vmh->vmh_cpuids[i].c == c &&
652			    vmh->vmh_cpuids[i].d == d)) {
653				log_debug("%s: incompatible cpu brand",
654				    __func__);
655				return (-1);
656			}
657			break;
658
659		case 0x01:
660			CPUID_LEAF(code, leaf, a, b, c, d);
661			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
662			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
663				log_debug("%s: incompatible cpu features "
664				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
665				    code, leaf);
666				return (-1);
667			}
668			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
669			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
670				log_debug("%s: incompatible cpu features "
671				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
672				    code, leaf);
673				return (-1);
674			}
675			break;
676
677		case 0x07:
678			CPUID_LEAF(code, leaf, a, b, c, d);
679			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
680			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
681				log_debug("%s: incompatible cpu features "
682				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
683				    code, leaf);
684				return (-1);
685			}
686			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
687			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
688				log_debug("%s: incompatible cpu features "
689				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
690				    code, leaf);
691				return (-1);
692			}
693			break;
694
695		case 0x0d:
696			CPUID_LEAF(code, leaf, a, b, c, d);
697			if (vmh->vmh_cpuids[i].b > b) {
698				log_debug("%s: incompatible cpu: insufficient "
699				    "max save area for enabled XCR0 features",
700				    __func__);
701				return (-1);
702			}
703			if (vmh->vmh_cpuids[i].c > c) {
704				log_debug("%s: incompatible cpu: insufficient "
705				    "max save area for supported XCR0 features",
706				    __func__);
707				return (-1);
708			}
709			break;
710
711		case 0x80000001:
712			CPUID_LEAF(code, leaf, a, b, c, d);
713			if ((vmh->vmh_cpuids[i].a & a) !=
714			    vmh->vmh_cpuids[i].a) {
715				log_debug("%s: incompatible cpu features "
716				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
717				    code, leaf);
718				return (-1);
719			}
720			if ((vmh->vmh_cpuids[i].c & c) !=
721			    vmh->vmh_cpuids[i].c) {
722				log_debug("%s: incompatible cpu features "
723				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
724				    code, leaf);
725				return (-1);
726			}
727			if ((vmh->vmh_cpuids[i].d & d) !=
728			    vmh->vmh_cpuids[i].d) {
729				log_debug("%s: incompatible cpu features "
730				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
731				    code, leaf);
732				return (-1);
733			}
734			break;
735
736		default:
737			log_debug("%s: unknown code 0x%x", __func__, code);
738			return (-1);
739		}
740	}
741
742	return (0);
743}
744
745void
746vmd_sighdlr(int sig, short event, void *arg)
747{
748	if (privsep_process != PROC_PARENT)
749		return;
750	log_debug("%s: handling signal", __func__);
751
752	switch (sig) {
753	case SIGHUP:
754		log_info("%s: reload requested with SIGHUP", __func__);
755
756		/*
757		 * This is safe because libevent uses async signal handlers
758		 * that run in the event loop and not in signal context.
759		 */
760		(void)vmd_reload(0, NULL);
761		break;
762	case SIGPIPE:
763		log_info("%s: ignoring SIGPIPE", __func__);
764		break;
765	case SIGUSR1:
766		log_info("%s: ignoring SIGUSR1", __func__);
767		break;
768	case SIGTERM:
769	case SIGINT:
770		vmd_shutdown();
771		break;
772	default:
773		fatalx("unexpected signal");
774	}
775}
776
777__dead void
778usage(void)
779{
780	extern char *__progname;
781	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
782	    __progname);
783	exit(1);
784}
785
/*
 * vmd entry point.  The same binary is re-executed for several roles:
 * the parent daemon, each privsep child process (-P/-I), a vm child
 * (-V) and a device child (-X/-t), so the option parsing below covers
 * all of them.  Only the parent ever reaches the event loop at the end.
 */
int
main(int argc, char **argv)
{
	struct privsep		*ps;
	int			 ch;
	enum privsep_procid	 proc_id = PROC_PARENT;
	int			 proc_instance = 0, vm_launch = 0;
	int			 vmm_fd = -1, vm_fd = -1;
	const char		*errp, *title = NULL;
	int			 argc0 = argc;	/* saved for proc_init() re-exec */
	char			 dev_type = '\0';

	log_init(0, LOG_DAEMON);

	if ((env = calloc(1, sizeof(*env))) == NULL)
		fatal("calloc: env");
	env->vmd_fd = -1;
	env->vmd_fd6 = -1;

	while ((ch = getopt(argc, argv, "D:P:I:V:X:df:i:nt:vp:")) != -1) {
		switch (ch) {
		case 'D':
			/* Define a config-file macro on the command line. */
			if (cmdline_symset(optarg) < 0)
				log_warnx("could not parse macro definition %s",
				    optarg);
			break;
		case 'd':
			/* Debug mode: stay in foreground, verbose logging. */
			env->vmd_debug = 2;
			break;
		case 'f':
			conffile = optarg;
			break;
		case 'v':
			env->vmd_verbose++;
			break;
		/* vmd fork/exec */
		case 'n':
			/* Config check only ("configtest"). */
			env->vmd_noaction = 1;
			break;
		case 'P':
			/* Run as the named privsep child process. */
			title = optarg;
			proc_id = proc_getid(procs, nitems(procs), title);
			if (proc_id == PROC_MAX)
				fatalx("invalid process name");
			break;
		case 'I':
			proc_instance = strtonum(optarg, 0,
			    PROC_MAX_INSTANCES, &errp);
			if (errp)
				fatalx("invalid process instance");
			break;
		/* child vm and device fork/exec */
		case 'p':
			title = optarg;
			break;
		case 'V':
			/* Run as a vm child; optarg is the vm fd number. */
			vm_launch = VMD_LAUNCH_VM;
			vm_fd = strtonum(optarg, 0, 128, &errp);
			if (errp)
				fatalx("invalid vm fd");
			break;
		case 'X':
			/* Run as a device child; optarg is the device fd. */
			vm_launch = VMD_LAUNCH_DEV;
			vm_fd = strtonum(optarg, 0, 128, &errp);
			if (errp)
				fatalx("invalid device fd");
			break;
		case 't':
			dev_type = *optarg;
			switch (dev_type) {
			case VMD_DEVTYPE_NET:
			case VMD_DEVTYPE_DISK:
				break;
			default: fatalx("invalid device type");
			}
			break;
		case 'i':
			/* Inherited /dev/vmm fd for vm/device children. */
			vmm_fd = strtonum(optarg, 0, 128, &errp);
			if (errp)
				fatalx("invalid vmm fd");
			break;
		default:
			usage();
		}
	}

	argc -= optind;
	if (argc > 0)
		usage();

	/* -n implies at least minimal debug output. */
	if (env->vmd_noaction && !env->vmd_debug)
		env->vmd_debug = 1;

	log_init(env->vmd_debug, LOG_DAEMON);
	log_setverbose(env->vmd_verbose);

	/* Re-exec from the vmm child process requires an absolute path. */
	if (proc_id == PROC_PARENT && *argv[0] != '/' && !env->vmd_noaction)
		fatalx("re-exec requires execution with an absolute path");
	env->argv0 = argv[0];

	/* check for root privileges */
	if (env->vmd_noaction == 0 && !vm_launch) {
		if (geteuid())
			fatalx("need root privileges");
	}

	ps = &env->vmd_ps;
	ps->ps_env = env;

	if (config_init(env) == -1)
		fatal("failed to initialize configuration");

	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
		fatal("unknown user %s", VMD_USER);

	/* First proc runs as root without pledge but in default chroot */
	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */

	/*
	 * If we're launching a new vm or its device, we short out here.
	 */
	if (vm_launch == VMD_LAUNCH_VM) {
		vm_main(vm_fd, vmm_fd);
		/* NOTREACHED */
	} else if (vm_launch == VMD_LAUNCH_DEV) {
		if (dev_type == VMD_DEVTYPE_NET) {
			log_procinit("vm/%s/vionet", title);
			vionet_main(vm_fd, vmm_fd);
			/* NOTREACHED */
		} else if (dev_type == VMD_DEVTYPE_DISK) {
			log_procinit("vm/%s/vioblk", title);
			vioblk_main(vm_fd, vmm_fd);
			/* NOTREACHED */
		}
		fatalx("unsupported device type '%c'", dev_type);
	}

	/* Open /dev/vmm early. */
	if (env->vmd_noaction == 0 && proc_id == PROC_PARENT) {
		env->vmd_fd = open(VMM_NODE, O_RDWR | O_CLOEXEC);
		if (env->vmd_fd == -1)
			fatal("%s", VMM_NODE);
	}

	/* Configure the control socket */
	ps->ps_csock.cs_name = SOCKET_NAME;
	TAILQ_INIT(&ps->ps_rcsocks);

	/* Configuration will be parsed after forking the children */
	env->vmd_conffile = conffile;

	if (env->vmd_noaction)
		ps->ps_noaction = 1;
	ps->ps_instance = proc_instance;
	if (title != NULL)
		ps->ps_title[proc_id] = title;

	/* only the parent returns */
	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
	    proc_id);

	if (ps->ps_noaction == 0)
		log_info("startup");

	event_init();

	/* Route the interesting signals through the event loop. */
	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);

	signal_add(&ps->ps_evsigint, NULL);
	signal_add(&ps->ps_evsigterm, NULL);
	signal_add(&ps->ps_evsighup, NULL);
	signal_add(&ps->ps_evsigpipe, NULL);
	signal_add(&ps->ps_evsigusr1, NULL);

	if (!env->vmd_noaction)
		proc_connect(ps);

	if (vmd_configure() == -1)
		fatalx("configuration failed");

	event_dispatch();

	log_debug("exiting");

	return (0);
}
978
979void
980start_vm_batch(int fd, short type, void *args)
981{
982	int		i = 0;
983	struct vmd_vm	*vm;
984
985	log_debug("%s: starting batch of %d vms", __func__,
986	    env->vmd_cfg.parallelism);
987	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
988		if (!(vm->vm_state & VM_STATE_WAITING)) {
989			log_debug("%s: not starting vm %s (disabled)",
990			    __func__,
991			    vm->vm_params.vmc_params.vcp_name);
992			continue;
993		}
994		i++;
995		if (i > env->vmd_cfg.parallelism) {
996			evtimer_add(&staggered_start_timer,
997			    &env->vmd_cfg.delay);
998			break;
999		}
1000		vm->vm_state &= ~VM_STATE_WAITING;
1001		config_setvm(&env->vmd_ps, vm, -1, vm->vm_params.vmc_owner.uid);
1002	}
1003	log_debug("%s: done starting vms", __func__);
1004}
1005
/*
 * Post-fork configuration of the parent process: pledge, parse the
 * config file, hand the /dev/vmm fd to the vmm process, push the global
 * config to all children, create the virtual switches, and kick off the
 * staggered VM start.
 *
 * Returns 0 on success, -1 on failure (exits directly on fatal errors
 * and on parse failure / -n config check).
 */
int
vmd_configure(void)
{
	int			ncpus;
	struct vmd_switch	*vsw;
	int ncpu_mib[] = {CTL_HW, HW_NCPUONLINE};
	size_t ncpus_sz = sizeof(ncpus);

	/*
	 * pledge in the parent process:
	 * stdio - for malloc and basic I/O including events.
	 * rpath - for reload to open and read the configuration files.
	 * wpath - for opening disk images and tap devices.
	 * tty - for openpty and TIOCUCNTL.
	 * proc - run kill to terminate its children safely.
	 * sendfd - for disks, interfaces and other fds.
	 * recvfd - for send and receive.
	 * getpw - lookup user or group id by name.
	 * chown, fattr - change tty ownership
	 * flock - locking disk files
	 */
	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
	    " chown fattr flock", NULL) == -1)
		fatal("pledge");

	/* Cache the pty master fd before any further privilege drops. */
	if ((env->vmd_ptmfd = getptmfd()) == -1)
		fatal("getptmfd %s", PATH_PTMDEV);

	if (parse_config(env->vmd_conffile) == -1) {
		proc_kill(&env->vmd_ps);
		exit(1);
	}

	if (env->vmd_noaction) {
		/* -n: config check only; tear everything down and exit. */
		fprintf(stderr, "configuration OK\n");
		proc_kill(&env->vmd_ps);
		exit(0);
	}

	/* Send VMM device fd to vmm proc. */
	proc_compose_imsg(&env->vmd_ps, PROC_VMM, -1,
	    IMSG_VMDOP_RECEIVE_VMM_FD, -1, env->vmd_fd, NULL, 0);

	/* Send shared global configuration to all children */
	if (config_setconfig(env) == -1)
		return (-1);

	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
		if (vsw->sw_running)
			continue;
		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
			log_warn("%s: failed to create switch %s",
			    __func__, vsw->sw_name);
			switch_remove(vsw);
			return (-1);
		}
	}

	/*
	 * No explicit staggered-start settings in the config: default to
	 * one batch per online CPU with the default delay.
	 */
	if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) {
		env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY;
		if (sysctl(ncpu_mib, nitems(ncpu_mib), &ncpus, &ncpus_sz, NULL, 0) == -1)
			ncpus = 1;
		env->vmd_cfg.parallelism = ncpus;
		log_debug("%s: setting staggered start configuration to "
		    "parallelism: %d and delay: %lld",
		    __func__, ncpus, (long long) env->vmd_cfg.delay.tv_sec);
	}

	log_debug("%s: starting vms in staggered fashion", __func__);
	evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
	/* start first batch */
	start_vm_batch(0, 0, NULL);

	return (0);
}
1081
/*
 * vmd_reload
 *
 * Load, reload or reset the running configuration.
 *
 * Parameters:
 *  reset: non-zero CONFIG_* level to purge instead of (re)loading
 *  filename: config file to parse; NULL/empty selects the default
 *      config file and switches to "reload" semantics
 *
 * Return values:
 *   0: success
 *  -1: parsing or child-configuration failed
 */
int
vmd_reload(unsigned int reset, const char *filename)
{
	struct vmd_vm		*vm, *next_vm;
	struct vmd_switch	*vsw;
	int			 reload = 0;

	/* Switch back to the default config file */
	if (filename == NULL || *filename == '\0') {
		filename = env->vmd_conffile;
		reload = 1;
	}

	log_debug("%s: level %d config file %s", __func__, reset, filename);

	if (reset) {
		/* Purge the configuration */
		config_purge(env, reset);
		config_setreset(env, reset);
	} else {
		/*
		 * Load or reload the configuration.
		 *
		 * Reloading removes all non-running VMs before processing the
		 * config file, whereas loading only adds to the existing list
		 * of VMs.
		 */

		if (reload) {
			/* Safe traversal: vm_remove() unlinks and frees vm. */
			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
			    next_vm) {
				if (!(vm->vm_state & VM_STATE_RUNNING)) {
					DPRINTF("%s: calling vm_remove",
					    __func__);
					vm_remove(vm, __func__);
				}
			}
		}

		if (parse_config(filename) == -1) {
			log_debug("%s: failed to load config file %s",
			    __func__, filename);
			return (-1);
		}

		if (reload) {
			/* Update shared global configuration in all children */
			if (config_setconfig(env) == -1)
				return (-1);
		}

		/* Bring up any switches added by the new configuration. */
		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
			if (vsw->sw_running)
				continue;
			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
				log_warn("%s: failed to create switch %s",
				    __func__, vsw->sw_name);
				switch_remove(vsw);
				return (-1);
			}
		}

		log_debug("%s: starting vms in staggered fashion", __func__);
		evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
		/* start first batch */
		start_vm_batch(0, 0, NULL);

		/* NOTE(review): misindented closing brace of the else block */
		}

	return (0);
}
1153
1154void
1155vmd_shutdown(void)
1156{
1157	struct vmd_vm *vm, *vm_next;
1158
1159	log_debug("%s: performing shutdown", __func__);
1160
1161	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
1162		vm_remove(vm, __func__);
1163	}
1164
1165	proc_kill(&env->vmd_ps);
1166	free(env);
1167
1168	log_warnx("terminating");
1169	exit(0);
1170}
1171
1172struct vmd_vm *
1173vm_getbyvmid(uint32_t vmid)
1174{
1175	struct vmd_vm	*vm;
1176
1177	if (vmid == 0)
1178		return (NULL);
1179	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1180		if (vm->vm_vmid == vmid)
1181			return (vm);
1182	}
1183
1184	return (NULL);
1185}
1186
1187struct vmd_vm *
1188vm_getbyid(uint32_t id)
1189{
1190	struct vmd_vm	*vm;
1191
1192	if (id == 0)
1193		return (NULL);
1194	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1195		if (vm->vm_params.vmc_params.vcp_id == id)
1196			return (vm);
1197	}
1198
1199	return (NULL);
1200}
1201
1202uint32_t
1203vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1204{
1205	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1206		return (0);
1207	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1208	    id, vm->vm_vmid);
1209	return (vm->vm_vmid);
1210}
1211
1212uint32_t
1213vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1214{
1215	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1216		return (0);
1217	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1218	    vmid, vm->vm_params.vmc_params.vcp_id);
1219	return (vm->vm_params.vmc_params.vcp_id);
1220}
1221
1222struct vmd_vm *
1223vm_getbyname(const char *name)
1224{
1225	struct vmd_vm	*vm;
1226
1227	if (name == NULL)
1228		return (NULL);
1229	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1230		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1231			return (vm);
1232	}
1233
1234	return (NULL);
1235}
1236
1237struct vmd_vm *
1238vm_getbypid(pid_t pid)
1239{
1240	struct vmd_vm	*vm;
1241
1242	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1243		if (vm->vm_pid == pid)
1244			return (vm);
1245	}
1246
1247	return (NULL);
1248}
1249
/*
 * vm_stop
 *
 * Release the runtime state of a vm: clear its run-state flags, tear
 * down the imsg event channel, close all disk/interface/kernel/cdrom
 * file descriptors and, unless 'keeptty' is set, release its tty.
 * The vm itself stays on the global list (see vm_remove()).
 *
 * Parameters:
 *  vm: the vm to stop (NULL is a no-op)
 *  keeptty: non-zero to keep the tty open (e.g. across a restart)
 *  caller: name of the calling function, for logging only
 */
void
vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 i, j;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s stopping vm %d%s",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");

	vm->vm_state &= ~(VM_STATE_RECEIVED | VM_STATE_RUNNING
	    | VM_STATE_SHUTDOWN);

	/* Stop listening for events before closing the channel fd. */
	if (vm->vm_iev.ibuf.fd != -1) {
		event_del(&vm->vm_iev.ev);
		close(vm->vm_iev.ibuf.fd);
	}
	/* Close every disk image fd, including qcow2 base images. */
	for (i = 0; i < VM_MAX_DISKS_PER_VM; i++) {
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
			if (vm->vm_disks[i][j] != -1) {
				close(vm->vm_disks[i][j]);
				vm->vm_disks[i][j] = -1;
			}
		}
	}
	/* Close tap fds and drop the per-interface strings. */
	for (i = 0; i < VM_MAX_NICS_PER_VM; i++) {
		if (vm->vm_ifs[i].vif_fd != -1) {
			close(vm->vm_ifs[i].vif_fd);
			vm->vm_ifs[i].vif_fd = -1;
		}
		free(vm->vm_ifs[i].vif_name);
		free(vm->vm_ifs[i].vif_switch);
		free(vm->vm_ifs[i].vif_group);
		vm->vm_ifs[i].vif_name = NULL;
		vm->vm_ifs[i].vif_switch = NULL;
		vm->vm_ifs[i].vif_group = NULL;
	}
	if (vm->vm_kernel != -1) {
		close(vm->vm_kernel);
		vm->vm_kernel = -1;
	}
	if (vm->vm_cdrom != -1) {
		close(vm->vm_cdrom);
		vm->vm_cdrom = -1;
	}
	if (!keeptty) {
		vm_closetty(vm);
		/* forget the console owner along with the tty */
		vm->vm_uid = 0;
	}
}
1303
1304void
1305vm_remove(struct vmd_vm *vm, const char *caller)
1306{
1307	struct privsep	*ps = &env->vmd_ps;
1308
1309	if (vm == NULL)
1310		return;
1311
1312	log_debug("%s: %s %s removing vm %d from running config",
1313	    __func__, ps->ps_title[privsep_process], caller,
1314	    vm->vm_vmid);
1315
1316	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);
1317
1318	vm_stop(vm, 0, caller);
1319	if (vm->vm_kernel_path != NULL && !vm->vm_from_config)
1320		free(vm->vm_kernel_path);
1321	free(vm);
1322}
1323
1324int
1325vm_claimid(const char *name, int uid, uint32_t *id)
1326{
1327	struct name2id *n2i = NULL;
1328
1329	TAILQ_FOREACH(n2i, env->vmd_known, entry)
1330		if (strcmp(n2i->name, name) == 0 && n2i->uid == uid)
1331			goto out;
1332
1333	if (++env->vmd_nvm == 0) {
1334		log_warnx("too many vms");
1335		return (-1);
1336	}
1337	if ((n2i = calloc(1, sizeof(struct name2id))) == NULL) {
1338		log_warnx("could not alloc vm name");
1339		return (-1);
1340	}
1341	n2i->id = env->vmd_nvm;
1342	n2i->uid = uid;
1343	if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name)) {
1344		log_warnx("vm name too long");
1345		free(n2i);
1346		return (-1);
1347	}
1348	TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry);
1349
1350out:
1351	*id = n2i->id;
1352	return (0);
1353}
1354
/*
 * vm_register
 *
 * Validate a set of create parameters and register the resulting vm on
 * the global vm list.  If a vm with the same name or vmid already
 * exists, *ret_vm is set to it and errno is set to EALREADY.
 *
 * Parameters:
 *  ps: privsep context, forwarded to vm_instance()
 *  vmc: the create parameters; copied into the new vm
 *  ret_vm: set to the new (or pre-existing) vm on success/EALREADY
 *  id: vmd-internal id to use, or 0 to claim a new one
 *  uid: the user ID requesting the registration
 *
 * Return values:
 *   0: success, *ret_vm is the new vm
 *  -1: failure, errno holds an errno or VMD_* error code
 */
int
vm_register(struct privsep *ps, struct vmop_create_params *vmc,
    struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
{
	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
	struct vm_create_params	*vcp = &vmc->vmc_params;
	struct vmop_owner	*vmo = NULL;
	uint32_t		 nid, rng;
	unsigned int		 i, j;
	struct vmd_switch	*sw;
	char			*s;
	int			 ret = 0;

	/* Check if this is an instance of another VM */
	if ((ret = vm_instance(ps, &vm_parent, vmc, uid)) != 0) {
		errno = ret; /* XXX might set invalid errno */
		return (-1);
	}

	errno = 0;
	*ret_vm = NULL;

	/* An existing vm with this name/id is returned via EALREADY. */
	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    uid) != 0) {
			errno = EPERM;
			goto fail;
		}
		/* refresh the kernel fd for a vm that is started again */
		vm->vm_kernel = vmc->vmc_kernel;
		*ret_vm = vm;
		errno = EALREADY;
		goto fail;
	}

	if (vm_parent != NULL)
		vmo = &vm_parent->vm_params.vmc_insowner;

	/* non-root users can only start existing VMs or instances */
	if (vm_checkperm(NULL, vmo, uid) != 0) {
		log_warnx("permission denied");
		errno = EPERM;
		goto fail;
	}
	if (vmc->vmc_flags == 0) {
		log_warnx("invalid configuration, no devices");
		errno = VMD_DISK_MISSING;
		goto fail;
	}
	/* Apply defaults before validating limits. */
	if (vcp->vcp_ncpus == 0)
		vcp->vcp_ncpus = 1;
	if (vcp->vcp_memranges[0].vmr_size == 0)
		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
		log_warnx("invalid number of CPUs");
		goto fail;
	} else if (vmc->vmc_ndisks > VM_MAX_DISKS_PER_VM) {
		log_warnx("invalid number of disks");
		goto fail;
	} else if (vmc->vmc_nnics > VM_MAX_NICS_PER_VM) {
		log_warnx("invalid number of interfaces");
		goto fail;
	} else if (vmc->vmc_kernel == -1 && vmc->vmc_ndisks == 0
	    && strlen(vmc->vmc_cdrom) == 0) {
		log_warnx("no kernel or disk/cdrom specified");
		goto fail;
	} else if (strlen(vcp->vcp_name) == 0) {
		log_warnx("invalid VM name");
		goto fail;
	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
	    *vcp->vcp_name == '_') {
		log_warnx("invalid VM name");
		goto fail;
	} else {
		/* names are alphanumeric plus '.', '-', '_' */
		for (s = vcp->vcp_name; *s != '\0'; ++s) {
			if (!(isalnum((unsigned char)*s) || *s == '.' || \
			    *s == '-' || *s == '_')) {
				log_warnx("invalid VM name");
				goto fail;
			}
		}
	}

	if ((vm = calloc(1, sizeof(*vm))) == NULL)
		goto fail;

	/* The vm holds its own copy of the create parameters. */
	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
	vmc = &vm->vm_params;
	vcp = &vmc->vmc_params;
	vm->vm_pid = -1;
	vm->vm_tty = -1;
	vm->vm_receive_fd = -1;
	vm->vm_kernel = -1;
	vm->vm_state &= ~VM_STATE_PAUSED;

	if (vmc->vmc_kernel > -1)
		vm->vm_kernel = vmc->vmc_kernel;

	/* Mark every possible fd slot as unused. */
	for (i = 0; i < VM_MAX_DISKS_PER_VM; i++)
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
			vm->vm_disks[i][j] = -1;
	for (i = 0; i < VM_MAX_NICS_PER_VM; i++)
		vm->vm_ifs[i].vif_fd = -1;
	for (i = 0; i < vmc->vmc_nnics; i++) {
		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
			/* inherit per-interface flags from the switch */
			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
		}

		/*
		 * If the MAC address is zero, always randomize it in vmd(8)
		 * because we cannot rely on the guest OS to do the right
		 * thing like OpenBSD does.  Based on ether_fakeaddr()
		 * from the kernel, incremented by one to differentiate
		 * the source.
		 */
		if (memcmp(zero_mac, &vmc->vmc_macs[i], ETHER_ADDR_LEN) == 0) {
			rng = arc4random();
			vmc->vmc_macs[i][0] = 0xfe;
			vmc->vmc_macs[i][1] = 0xe1;
			vmc->vmc_macs[i][2] = 0xba + 1;
			vmc->vmc_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
			vmc->vmc_macs[i][4] = rng;
			vmc->vmc_macs[i][5] = rng >> 8;
		}
	}
	vm->vm_cdrom = -1;
	vm->vm_iev.ibuf.fd = -1;

	/*
	 * Assign a new internal Id if not specified and we succeed in
	 * claiming a new Id.
	 */
	if (id != 0)
		vm->vm_vmid = id;
	else if (vm_claimid(vcp->vcp_name, uid, &nid) == -1)
		goto fail;
	else
		vm->vm_vmid = nid;

	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);

	*ret_vm = vm;
	return (0);
 fail:
	if (errno == 0)
		errno = EINVAL;
	return (-1);
}
1505
1506int
1507vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1508    struct vmop_create_params *vmc, uid_t uid)
1509{
1510	char			*name;
1511	struct vm_create_params	*vcp = &vmc->vmc_params;
1512	struct vmop_create_params *vmcp;
1513	struct vm_create_params	*vcpp;
1514	unsigned int		 i, j;
1515
1516	/* return without error if the parent is NULL (nothing to inherit) */
1517	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1518	    vmc->vmc_instance[0] == '\0')
1519		return (0);
1520
1521	if ((*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) {
1522		return (VMD_PARENT_INVALID);
1523	}
1524
1525	vmcp = &(*vm_parent)->vm_params;
1526	vcpp = &vmcp->vmc_params;
1527
1528	/* Are we allowed to create an instance from this VM? */
1529	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1530		log_warnx("vm \"%s\" no permission to create vm instance",
1531		    vcpp->vcp_name);
1532		return (ENAMETOOLONG);
1533	}
1534
1535	name = vcp->vcp_name;
1536
1537	if (vm_getbyname(vcp->vcp_name) != NULL ||
1538	    vm_getbyvmid(vcp->vcp_id) != NULL) {
1539		return (EPROCLIM);
1540	}
1541
1542	/* CPU */
1543	if (vcp->vcp_ncpus == 0)
1544		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1545	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1546	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1547		log_warnx("vm \"%s\" no permission to set cpus", name);
1548		return (EPERM);
1549	}
1550
1551	/* memory */
1552	if (vcp->vcp_memranges[0].vmr_size == 0)
1553		vcp->vcp_memranges[0].vmr_size =
1554		    vcpp->vcp_memranges[0].vmr_size;
1555	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1556	    vcp->vcp_memranges[0].vmr_size !=
1557	    vcpp->vcp_memranges[0].vmr_size) {
1558		log_warnx("vm \"%s\" no permission to set memory", name);
1559		return (EPERM);
1560	}
1561
1562	/* disks cannot be inherited */
1563	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1564	    vmc->vmc_ndisks) {
1565		log_warnx("vm \"%s\" no permission to set disks", name);
1566		return (EPERM);
1567	}
1568	for (i = 0; i < vmc->vmc_ndisks; i++) {
1569		/* Check if this disk is already used in the parent */
1570		for (j = 0; j < vmcp->vmc_ndisks; j++) {
1571			if (strcmp(vmc->vmc_disks[i],
1572			    vmcp->vmc_disks[j]) == 0) {
1573				log_warnx("vm \"%s\" disk %s cannot be reused",
1574				    name, vmc->vmc_disks[i]);
1575				return (EBUSY);
1576			}
1577		}
1578		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1579	}
1580
1581	/* interfaces */
1582	if (vmc->vmc_nnics > 0 &&
1583	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1584	    vmc->vmc_nnics != vmcp->vmc_nnics) {
1585		log_warnx("vm \"%s\" no permission to set interfaces", name);
1586		return (EPERM);
1587	}
1588	for (i = 0; i < vmcp->vmc_nnics; i++) {
1589		/* Interface got overwritten */
1590		if (i < vmc->vmc_nnics)
1591			continue;
1592
1593		/* Copy interface from parent */
1594		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1595		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1596		    sizeof(vmc->vmc_ifnames[i]));
1597		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1598		    sizeof(vmc->vmc_ifswitch[i]));
1599		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1600		    sizeof(vmc->vmc_ifgroup[i]));
1601		memcpy(vmc->vmc_macs[i], vmcp->vmc_macs[i],
1602		    sizeof(vmc->vmc_macs[i]));
1603		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1604		vmc->vmc_nnics++;
1605	}
1606	for (i = 0; i < vmc->vmc_nnics; i++) {
1607		for (j = 0; j < vmcp->vmc_nnics; j++) {
1608			if (memcmp(zero_mac, vmc->vmc_macs[i],
1609			    sizeof(vmc->vmc_macs[i])) != 0 &&
1610			    memcmp(vmcp->vmc_macs[i], vmc->vmc_macs[i],
1611			    sizeof(vmc->vmc_macs[i])) != 0) {
1612				log_warnx("vm \"%s\" lladdr cannot be reused",
1613				    name);
1614				return (EBUSY);
1615			}
1616			if (strlen(vmc->vmc_ifnames[i]) &&
1617			    strcmp(vmc->vmc_ifnames[i],
1618			    vmcp->vmc_ifnames[j]) == 0) {
1619				log_warnx("vm \"%s\" %s cannot be reused",
1620				    vmc->vmc_ifnames[i], name);
1621				return (EBUSY);
1622			}
1623		}
1624	}
1625
1626	/* kernel */
1627	if (vmc->vmc_kernel > -1 || ((*vm_parent)->vm_kernel_path != NULL &&
1628		strnlen((*vm_parent)->vm_kernel_path, PATH_MAX) < PATH_MAX)) {
1629		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1630			log_warnx("vm \"%s\" no permission to set boot image",
1631			    name);
1632			return (EPERM);
1633		}
1634		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1635	}
1636
1637	/* cdrom */
1638	if (strlen(vmc->vmc_cdrom) > 0) {
1639		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1640			log_warnx("vm \"%s\" no permission to set cdrom", name);
1641			return (EPERM);
1642		}
1643		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1644	} else if (strlcpy(vmc->vmc_cdrom, vmcp->vmc_cdrom,
1645	    sizeof(vmc->vmc_cdrom)) >= sizeof(vmc->vmc_cdrom)) {
1646		log_warnx("vm \"%s\" cdrom name too long", name);
1647		return (EINVAL);
1648	}
1649
1650	/* user */
1651	if (vmc->vmc_owner.uid == 0)
1652		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1653	else if (vmc->vmc_owner.uid != uid &&
1654	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1655		log_warnx("vm \"%s\" user mismatch", name);
1656		return (EPERM);
1657	}
1658
1659	/* group */
1660	if (vmc->vmc_owner.gid == 0)
1661		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1662	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1663		log_warnx("vm \"%s\" group mismatch", name);
1664		return (EPERM);
1665	}
1666
1667	/* child instances */
1668	if (vmc->vmc_insflags) {
1669		log_warnx("vm \"%s\" cannot change instance permissions", name);
1670		return (EPERM);
1671	}
1672	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1673		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
1674		vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid;
1675		vmc->vmc_insflags = vmcp->vmc_insflags;
1676	} else {
1677		vmc->vmc_insowner.gid = 0;
1678		vmc->vmc_insowner.uid = 0;
1679		vmc->vmc_insflags = 0;
1680	}
1681
1682	/* finished, remove instance flags */
1683	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1684
1685	return (0);
1686}
1687
1688/*
1689 * vm_checkperm
1690 *
1691 * Checks if the user represented by the 'uid' parameter is allowed to
1692 * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1693 * console.)
1694 *
1695 * Parameters:
1696 *  vm: the VM whose permission is to be checked
1697 *  vmo: the required uid/gid to be checked
1698 *  uid: the user ID of the user making the request
1699 *
1700 * Return values:
1701 *   0: the permission should be granted
1702 *  -1: the permission check failed (also returned if vm == null)
1703 */
1704int
1705vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1706{
1707	struct group	*gr;
1708	struct passwd	*pw;
1709	char		**grmem;
1710
1711	/* root has no restrictions */
1712	if (uid == 0)
1713		return (0);
1714
1715	if (vmo == NULL)
1716		return (-1);
1717
1718	/* check user */
1719	if (vm == NULL) {
1720		if  (vmo->uid == uid)
1721			return (0);
1722	} else {
1723		/*
1724		 * check user of running vm (the owner of a running vm can
1725		 * be different to (or more specific than) the configured owner.
1726		 */
1727		if (((vm->vm_state & VM_STATE_RUNNING) && vm->vm_uid == uid) ||
1728		    (!(vm->vm_state & VM_STATE_RUNNING) && vmo->uid == uid))
1729			return (0);
1730	}
1731
1732	/* check groups */
1733	if (vmo->gid != -1) {
1734		if ((pw = getpwuid(uid)) == NULL)
1735			return (-1);
1736		if (pw->pw_gid == vmo->gid)
1737			return (0);
1738		if ((gr = getgrgid(vmo->gid)) != NULL) {
1739			for (grmem = gr->gr_mem; *grmem; grmem++)
1740				if (strcmp(*grmem, pw->pw_name) == 0)
1741					return (0);
1742		}
1743	}
1744
1745	return (-1);
1746}
1747
1748/*
1749 * vm_checkinsflag
1750 *
1751 * Checks whether the non-root user is allowed to set an instance option.
1752 *
1753 * Parameters:
1754 *  vmc: the VM create parameters
1755 *  flag: the flag to be checked
1756 *  uid: the user ID of the user making the request
1757 *
1758 * Return values:
1759 *   0: the permission should be granted
1760 *  -1: the permission check failed (also returned if vm == null)
1761 */
1762int
1763vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1764{
1765	/* root has no restrictions */
1766	if (uid == 0)
1767		return (0);
1768
1769	if ((vmc->vmc_insflags & flag) == 0)
1770		return (-1);
1771
1772	return (0);
1773}
1774
1775/*
1776 * vm_checkaccess
1777 *
1778 * Checks if the user represented by the 'uid' parameter is allowed to
1779 * access the file described by the 'path' parameter.
1780 *
1781 * Parameters:
1782 *  fd: the file descriptor of the opened file
1783 *  uflag: check if the userid has access to the file
1784 *  uid: the user ID of the user making the request
1785 *  amode: the access flags of R_OK and W_OK
1786 *
1787 * Return values:
1788 *   0: the permission should be granted
1789 *  -1: the permission check failed
1790 */
1791int
1792vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
1793{
1794	struct group	*gr;
1795	struct passwd	*pw;
1796	char		**grmem;
1797	struct stat	 st;
1798	mode_t		 mode;
1799
1800	if (fd == -1)
1801		return (-1);
1802
1803	/*
1804	 * File has to be accessible and a regular file
1805	 */
1806	if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode))
1807		return (-1);
1808
1809	/* root has no restrictions */
1810	if (uid == 0 || uflag == 0)
1811		return (0);
1812
1813	/* check other */
1814	mode = amode & W_OK ? S_IWOTH : 0;
1815	mode |= amode & R_OK ? S_IROTH : 0;
1816	if ((st.st_mode & mode) == mode)
1817		return (0);
1818
1819	/* check user */
1820	mode = amode & W_OK ? S_IWUSR : 0;
1821	mode |= amode & R_OK ? S_IRUSR : 0;
1822	if (uid == st.st_uid && (st.st_mode & mode) == mode)
1823		return (0);
1824
1825	/* check groups */
1826	mode = amode & W_OK ? S_IWGRP : 0;
1827	mode |= amode & R_OK ? S_IRGRP : 0;
1828	if ((st.st_mode & mode) != mode)
1829		return (-1);
1830	if ((pw = getpwuid(uid)) == NULL)
1831		return (-1);
1832	if (pw->pw_gid == st.st_gid)
1833		return (0);
1834	if ((gr = getgrgid(st.st_gid)) != NULL) {
1835		for (grmem = gr->gr_mem; *grmem; grmem++)
1836			if (strcmp(*grmem, pw->pw_name) == 0)
1837				return (0);
1838	}
1839
1840	return (-1);
1841}
1842
/*
 * vm_opentty
 *
 * Allocate a pty for the vm console via the pre-opened /dev/ptm fd,
 * enable user ioctl mode on it and adjust the tty's ownership and mode
 * for the vm's owner.  On failure the tty is released again.
 *
 * Return values:
 *   0: success, vm_tty/vm_ttyname are set
 *  -1: failure
 */
int
vm_opentty(struct vmd_vm *vm)
{
	struct stat		 st;
	struct group		*gr;
	uid_t			 uid;
	gid_t			 gid;
	mode_t			 mode;
	int			 on = 1, tty_slave;

	/*
	 * Open tty with pre-opened PTM fd
	 */
	if (fdopenpty(env->vmd_ptmfd, &vm->vm_tty, &tty_slave, vm->vm_ttyname,
	    NULL, NULL) == -1) {
		log_warn("fdopenpty");
		return (-1);
	}
	/* only the master side is kept; the vm process reopens the slave */
	close(tty_slave);

	/*
	 * We use user ioctl(2) mode to pass break commands.
	 */
	if (ioctl(vm->vm_tty, TIOCUCNTL, &on) == -1) {
		log_warn("could not enable user ioctl mode on %s",
		    vm->vm_ttyname);
		goto fail;
	}

	uid = vm->vm_uid;
	gid = vm->vm_params.vmc_owner.gid;

	/* pick group and mode: owner group > "tty" group > owner only */
	if (vm->vm_params.vmc_owner.gid != -1) {
		mode = 0660;
	} else if ((gr = getgrnam("tty")) != NULL) {
		gid = gr->gr_gid;
		mode = 0620;
	} else {
		mode = 0600;
		gid = 0;
	}

	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
	    __func__, vm->vm_params.vmc_params.vcp_name,
	    vm->vm_ttyname, uid, gid, mode);

	/*
	 * Change ownership and mode of the tty as required.
	 * Loosely based on the implementation of sshpty.c
	 */
	if (fstat(vm->vm_tty, &st) == -1) {
		log_warn("fstat failed for %s", vm->vm_ttyname);
		goto fail;
	}

	if (st.st_uid != uid || st.st_gid != gid) {
		if (chown(vm->vm_ttyname, uid, gid) == -1) {
			log_warn("chown %s %d %d failed, uid %d",
			    vm->vm_ttyname, uid, gid, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
		if (chmod(vm->vm_ttyname, mode) == -1) {
			log_warn("chmod %s %o failed, uid %d",
			    vm->vm_ttyname, mode, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	return (0);
 fail:
	vm_closetty(vm);
	return (-1);
}
1927
1928void
1929vm_closetty(struct vmd_vm *vm)
1930{
1931	if (vm->vm_tty != -1) {
1932		/* Release and close the tty */
1933		if (fchown(vm->vm_tty, 0, 0) == -1)
1934			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1935		if (fchmod(vm->vm_tty, 0666) == -1)
1936			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1937		close(vm->vm_tty);
1938		vm->vm_tty = -1;
1939	}
1940	memset(&vm->vm_ttyname, 0, sizeof(vm->vm_ttyname));
1941}
1942
1943void
1944switch_remove(struct vmd_switch *vsw)
1945{
1946	if (vsw == NULL)
1947		return;
1948
1949	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1950
1951	free(vsw->sw_group);
1952	free(vsw->sw_name);
1953	free(vsw);
1954}
1955
1956struct vmd_switch *
1957switch_getbyname(const char *name)
1958{
1959	struct vmd_switch	*vsw;
1960
1961	if (name == NULL)
1962		return (NULL);
1963	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1964		if (strcmp(vsw->sw_name, name) == 0)
1965			return (vsw);
1966	}
1967
1968	return (NULL);
1969}
1970
/*
 * get_string
 *
 * Copy the leading run of printable characters from the byte buffer
 * 'ptr' (at most 'len' bytes) into a freshly allocated NUL-terminated
 * string.  The caller must free the result.
 *
 * Returns the new string, or NULL on allocation failure.
 */
char *
get_string(uint8_t *ptr, size_t len)
{
	size_t	 i;

	for (i = 0; i < len; i++)
		if (!isprint((unsigned char)ptr[i]))
			break;

	/* cast: ptr is a byte buffer, strndup() expects a char pointer */
	return strndup((const char *)ptr, i);
}
1982
/*
 * prefixlen2mask
 *
 * Convert an IPv4 prefix length into a netmask in network byte order.
 * Lengths above 32 are clamped to 32; length 0 yields an empty mask.
 */
uint32_t
prefixlen2mask(uint8_t prefixlen)
{
	uint32_t mask;

	if (prefixlen == 0)
		return (0);
	if (prefixlen > 32)
		prefixlen = 32;

	mask = 0xffffffff << (32 - prefixlen);
	return (htonl(mask));
}
1994
/*
 * prefixlen2mask6
 *
 * Convert an IPv6 prefix length into a netmask, written to 'mask'.
 * Lengths above 128 are clamped to 128.
 */
void
prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
{
	struct in6_addr	 tmp;
	int		 full, rem;

	if (prefixlen > 128)
		prefixlen = 128;

	memset(&tmp, 0, sizeof(tmp));

	/* whole 0xff bytes first, then the partial leading-bits byte */
	full = prefixlen / 8;
	rem = prefixlen % 8;
	memset(tmp.s6_addr, 0xff, full);
	if (rem != 0)
		tmp.s6_addr[full] = 0xff00 >> rem;

	memcpy(mask, &tmp, sizeof(tmp));
}
2013
2014void
2015getmonotime(struct timeval *tv)
2016{
2017	struct timespec	 ts;
2018
2019	if (clock_gettime(CLOCK_MONOTONIC, &ts))
2020		fatal("clock_gettime");
2021
2022	TIMESPEC_TO_TIMEVAL(tv, &ts);
2023}
2024
2025static inline void
2026vm_terminate(struct vmd_vm *vm, const char *caller)
2027{
2028	if (vm->vm_from_config)
2029		vm_stop(vm, 0, caller);
2030	else {
2031		/* vm_remove calls vm_stop */
2032		vm_remove(vm, caller);
2033	}
2034}
2035
2036/*
2037 * Utility function for closing vm file descriptors. Assumes an fd of -1 was
2038 * already closed or never opened.
2039 *
2040 * Returns 0 on success, otherwise -1 on failure.
2041 */
2042int
2043close_fd(int fd)
2044{
2045	int	ret;
2046
2047	if (fd == -1)
2048		return (0);
2049
2050#ifdef POSIX_CLOSE_RESTART
2051	do { ret = close(fd); } while (ret == -1 && errno == EINTR);
2052#else
2053	ret = close(fd);
2054#endif /* POSIX_CLOSE_RESTART */
2055
2056	if (ret == -1 && errno == EIO)
2057		log_warn("%s(%d)", __func__, fd);
2058
2059	return (ret);
2060}
2061