vmd.c revision 1.150
1/*	$OpenBSD: vmd.c,v 1.150 2023/06/18 11:45:11 op Exp $	*/
2
3/*
4 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/types.h>
20#include <sys/queue.h>
21#include <sys/wait.h>
22#include <sys/stat.h>
23#include <sys/sysctl.h>
24#include <sys/tty.h>
25#include <sys/ttycom.h>
26#include <sys/ioctl.h>
27
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31#include <termios.h>
32#include <errno.h>
33#include <event.h>
34#include <fcntl.h>
35#include <pwd.h>
36#include <signal.h>
37#include <syslog.h>
38#include <unistd.h>
39#include <util.h>
40#include <ctype.h>
41#include <grp.h>
42
43#include <machine/specialreg.h>
44#include <machine/vmmvar.h>
45
46#include "proc.h"
47#include "atomicio.h"
48#include "vmd.h"
49
__dead void usage(void);

/* Local prototypes (vmd.c-internal entry points). */
int	 main(int, char **);
int	 vmd_configure(void);
void	 vmd_sighdlr(int sig, short event, void *arg);
void	 vmd_shutdown(void);
int	 vmd_control_run(void);
int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
int	 vmd_dispatch_agentx(int, struct privsep_proc *, struct imsg *);
int	 vmd_dispatch_priv(int, struct privsep_proc *, struct imsg *);
int	 vmd_check_vmh(struct vm_dump_header *);

int	 vm_instance(struct privsep *, struct vmd_vm **,
	    struct vmop_create_params *, uid_t);
int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
int	 vm_claimid(const char *, int, uint32_t *);
void	 start_vm_batch(int, short, void*);

static inline void vm_terminate(struct vmd_vm *, const char *);

/* Global daemon state, shared by all code in the parent process. */
struct vmd	*env;

/* Child process table: one entry per privsep subprocess forked by vmd. */
static struct privsep_proc procs[] = {
	/* Keep "priv" on top as procs[0] */
	{ "priv",	PROC_PRIV,	vmd_dispatch_priv, priv },
	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm,
	  vmm_shutdown, "/" },
	{ "agentx", 	PROC_AGENTX,	vmd_dispatch_agentx, vm_agentx,
	  vm_agentx_shutdown, "/" }
};

/* Which privsep process this instance is running as. */
enum privsep_procid privsep_process;

/* Timer used to start VMs in staggered batches (see start_vm_batch()). */
struct event staggered_start_timer;

/* For the privileged process */
static struct privsep_proc *proc_priv = &procs[0];
static struct passwd proc_privpw;
static const uint8_t zero_mac[ETHER_ADDR_LEN];

/* Configuration file path; overridable with -f on the command line. */
const char		 default_conffile[] = VMD_CONF;
const char		*conffile = default_conffile;
94
/*
 * Dispatch imsgs received from the control process (vmctl requests
 * relayed over the control socket) in the parent process.
 *
 * Handles VM lifecycle requests (start, wait, terminate, pause,
 * unpause, send, receive), configuration (re)loads and verbosity
 * changes.  Replies are composed at the bottom based on `cmd'/`res'.
 *
 * Returns 0 when the message was handled, -1 to tear down the imsg
 * channel (fatal compose error or unknown message type).
 */
int
vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct privsep			*ps = p->p_ps;
	int				 res = 0, ret = 0, cmd = 0, verbose;
	unsigned int			 v = 0, flags;
	struct vmop_create_params	 vmc;
	struct vmop_id			 vid;
	struct vmop_result		 vmr;
	struct vm_dump_header		 vmh;
	struct vmd_vm			*vm = NULL;
	char				*str = NULL;
	uint32_t			 id = 0;
	struct control_sock		*rcs;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_START_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vmc);
		memcpy(&vmc, imsg->data, sizeof(vmc));
		/* The kernel image arrives as the fd attached to the imsg. */
		vmc.vmc_kernel = imsg->fd;

		/* Try registering our VM in our list of known VMs. */
		if (vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid)) {
			res = errno;

			/* Did we have a failure during lookup of a parent? */
			if (vm == NULL) {
				cmd = IMSG_VMDOP_START_VM_RESPONSE;
				break;
			}

			/* Does the VM already exist? */
			if (res == EALREADY) {
				/* Is it already running? */
				if (vm->vm_state & VM_STATE_RUNNING) {
					cmd = IMSG_VMDOP_START_VM_RESPONSE;
					break;
				}

				/* If not running, are our flags ok? */
				if (vmc.vmc_flags &&
				    vmc.vmc_flags != VMOP_CREATE_KERNEL) {
					cmd = IMSG_VMDOP_START_VM_RESPONSE;
					break;
				}
			}
			/* Existing but stopped VM: restart it below. */
			res = 0;
		}

		/* Try to start the launch of the VM. */
		res = config_setvm(ps, vm, imsg->hdr.peerid,
		    vm->vm_params.vmc_owner.uid);
		if (res)
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		break;
	case IMSG_VMDOP_WAIT_VM_REQUEST:
	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		flags = vid.vid_flags;
		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;

		if ((id = vid.vid_id) == 0) {
			/* Lookup vm (id) by name */
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				break;
			} else if ((vm->vm_state & VM_STATE_SHUTDOWN) &&
			    (flags & VMOP_FORCE) == 0) {
				/* Already shutting down; only -f overrides. */
				res = EALREADY;
				break;
			} else if (!(vm->vm_state & VM_STATE_RUNNING)) {
				res = EINVAL;
				break;
			}
			id = vm->vm_vmid;
		} else if ((vm = vm_getbyvmid(id)) == NULL) {
			res = ENOENT;
			break;
		}
		/* The requesting uid must own or be allowed to manage vm. */
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner, vid.vid_uid)) {
			res = EPERM;
			break;
		}

		/* Only relay TERMINATION requests, not WAIT requests */
		if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_REQUEST) {
			memset(&vid, 0, sizeof(vid));
			vid.vid_id = id;
			vid.vid_flags = flags;

			if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
				imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
				return (-1);
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
		/* vmm owns the authoritative list of running VMs. */
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		break;
	case IMSG_VMDOP_LOAD:
		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
		/*
		 * NOTE(review): get_string() may return NULL on allocation
		 * failure; vmd_reload(0, NULL) then falls back to the
		 * default config file — presumably intentional, verify.
		 */
		str = get_string((uint8_t *)imsg->data,
		    IMSG_DATA_SIZE(imsg));
		/* FALLTHROUGH: LOAD is RELOAD with an explicit path. */
	case IMSG_VMDOP_RELOAD:
		if (vmd_reload(0, str) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		free(str);
		break;
	case IMSG_CTL_RESET:
		IMSG_SIZE_CHECK(imsg, &v);
		memcpy(&v, imsg->data, sizeof(v));
		if (vmd_reload(v, NULL) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		break;
	case IMSG_CTL_VERBOSE:
		IMSG_SIZE_CHECK(imsg, &verbose);
		memcpy(&verbose, imsg->data, sizeof(verbose));
		log_setverbose(verbose);

		/* Propagate the new verbosity to the child processes. */
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
		cmd = IMSG_CTL_OK;
		break;
	case IMSG_VMDOP_PAUSE_VM:
	case IMSG_VMDOP_UNPAUSE_VM:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		if (vid.vid_id == 0) {
			/* Resolve name to id before relaying to vmm. */
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
				    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
				    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
			break;
		}
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    vid.vid_uid) != 0) {
			res = EPERM;
			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
			break;
		}
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_SEND_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		if (vid.vid_id == 0) {
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
				/* Drop the dump fd we won't use. */
				close(imsg->fd);
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
			close(imsg->fd);
			break;
		}
		vmr.vmr_id = vid.vid_id;
		log_debug("%s: sending fd to vmm", __func__);
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		if (imsg->fd == -1) {
			log_warnx("%s: invalid fd", __func__);
			return (-1);
		}
		/* Read and validate the dump header before registering. */
		if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) !=
		    sizeof(vmh)) {
			log_warnx("%s: error reading vmh from received vm",
			    __func__);
			res = EIO;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}

		if (vmd_check_vmh(&vmh)) {
			res = ENOENT;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) !=
		    sizeof(vmc)) {
			log_warnx("%s: error reading vmc from received vm",
			    __func__);
			res = EIO;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		/* Received VM gets the name requested by the client. */
		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
		    sizeof(vmc.vmc_params.vcp_name));
		vmc.vmc_params.vcp_id = 0;

		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
		if (ret != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			close(imsg->fd);
		} else {
			vm->vm_state |= VM_STATE_RECEIVED;
			config_setvm(ps, vm, imsg->hdr.peerid,
			    vmc.vmc_owner.uid);
			log_debug("%s: sending fd to vmm", __func__);
			/* vmm reads the remaining VM state from the fd. */
			proc_compose_imsg(ps, PROC_VMM, -1,
			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd,
			    NULL, 0);
		}
		break;
	case IMSG_VMDOP_DONE:
		/* Reload finished: reopen all control sockets. */
		control_reset(&ps->ps_csock);
		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
			control_reset(rcs);
		cmd = 0;
		break;
	default:
		return (-1);
	}

	/* Compose the reply, if any, back to the control process. */
	switch (cmd) {
	case 0:
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		memset(&vmr, 0, sizeof(vmr));
		vmr.vmr_result = res;
		vmr.vmr_id = id;
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	default:
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
			return (-1);
		break;
	}

	return (0);
}
360
/*
 * Dispatch imsgs received from the vmm process in the parent:
 * responses to lifecycle requests and asynchronous VM events.
 * Most responses are relayed onward to the control (or agentx)
 * process after updating the parent's view of the VM state.
 *
 * Returns 0 when the message was handled, -1 to tear down the
 * imsg channel.
 */
int
vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct vmop_result	 vmr;
	struct privsep		*ps = p->p_ps;
	int			 res = 0;
	struct vmd_vm		*vm;
	struct vm_create_params	*vcp;
	struct vmop_info_result	 vir;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
			break;
		/*
		 * NOTE(review): sizeof(imsg->data) is the size of the void *
		 * pointer, not of the payload — looks like this should be
		 * sizeof(vmr); verify against upstream.
		 */
		proc_compose_imsg(ps, PROC_CONTROL, -1,
		    imsg->hdr.type, imsg->hdr.peerid, -1,
		    imsg->data, sizeof(imsg->data));
		log_info("%s: paused vm %d successfully",
		    vm->vm_params.vmc_params.vcp_name,
		    vm->vm_vmid);
		vm->vm_state |= VM_STATE_PAUSED;
		break;
	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
			break;
		/* NOTE(review): same sizeof(imsg->data) concern as above. */
		proc_compose_imsg(ps, PROC_CONTROL, -1,
		    imsg->hdr.type, imsg->hdr.peerid, -1,
		    imsg->data, sizeof(imsg->data));
		log_info("%s: unpaused vm %d successfully.",
		    vm->vm_params.vmc_params.vcp_name,
		    vm->vm_vmid);
		vm->vm_state &= ~VM_STATE_PAUSED;
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		/* The peerid carries the vmid of the VM being started. */
		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
			break;
		vm->vm_pid = vmr.vmr_pid;
		vcp = &vm->vm_params.vmc_params;
		vcp->vcp_id = vmr.vmr_id;

		/*
		 * If the peerid is not -1, forward the response back to the
		 * the control socket.  If it is -1, the request originated
		 * from the parent, not the control socket.
		 */
		if (vm->vm_peerid != (uint32_t)-1) {
			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
			    sizeof(vmr.vmr_ttyname));
			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
			    imsg->hdr.type, vm->vm_peerid, -1,
			    &vmr, sizeof(vmr)) == -1) {
				errno = vmr.vmr_result;
				log_warn("%s: failed to forward vm result",
				    vcp->vcp_name);
				vm_terminate(vm, __func__);
				return (-1);
			}
		}

		if (vmr.vmr_result) {
			/* vmm could not start the VM; clean it up here. */
			log_warnx("%s: failed to start vm", vcp->vcp_name);
			vm_terminate(vm, __func__);
			errno = vmr.vmr_result;
			break;
		}

		/* Now configure all the interfaces */
		if (vm_priv_ifconfig(ps, vm) == -1) {
			log_warn("%s: failed to configure vm", vcp->vcp_name);
			vm_terminate(vm, __func__);
			break;
		}

		log_info("%s: started vm %d successfully, tty %s",
		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
		break;
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));

		if (vmr.vmr_result) {
			/* Termination failed: report the error to control. */
			DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
			    __func__, vmr.vmr_id);
			proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
		} else {
			if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
				break;
			/* Mark VM as shutting down */
			vm->vm_state |= VM_STATE_SHUTDOWN;
		}
		break;
	case IMSG_VMDOP_SEND_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
			break;
		if (!vmr.vmr_result) {
			/* VM state was sent out; drop the local instance. */
			log_info("%s: sent vm %d successfully.",
			    vm->vm_params.vmc_params.vcp_name,
			    vm->vm_vmid);
			vm_terminate(vm, __func__);
		}

		/* Send a response if a control client is waiting for it */
		if (imsg->hdr.peerid != (uint32_t)-1) {
			/* the error is meaningless for deferred responses */
			vmr.vmr_result = 0;

			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
			    IMSG_VMDOP_SEND_VM_RESPONSE,
			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
				return (-1);
		}
		break;
	case IMSG_VMDOP_TERMINATE_VM_EVENT:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
		    __func__, vmr.vmr_id, vmr.vmr_result);
		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
			log_debug("%s: vm %d is no longer available",
			    __func__, vmr.vmr_id);
			break;
		}
		if (vmr.vmr_result != EAGAIN ||
		    vm->vm_params.vmc_bootdevice) {
			vm_terminate(vm, __func__);
		} else {
			/* Stop VM instance but keep the tty open */
			vm_stop(vm, 1, __func__);
			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
		}

		/* The error is meaningless for deferred responses */
		vmr.vmr_result = 0;

		if (proc_compose_imsg(ps, PROC_CONTROL, -1,
			IMSG_VMDOP_TERMINATE_VM_EVENT,
			imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	case IMSG_VMDOP_GET_INFO_VM_DATA:
		IMSG_SIZE_CHECK(imsg, &vir);
		memcpy(&vir, imsg->data, sizeof(vir));
		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
			/* Annotate the vmm data with parent-side state. */
			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
			if (vm->vm_ttyname[0] != '\0')
				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
				    sizeof(vir.vir_ttyname));
			log_debug("%s: running vm: %d, vm_state: 0x%x",
			    __func__, vm->vm_vmid, vm->vm_state);
			vir.vir_state = vm->vm_state;
			/* get the user id who started the vm */
			vir.vir_uid = vm->vm_uid;
			vir.vir_gid = vm->vm_params.vmc_owner.gid;
		}
		/*
		 * NOTE(review): if vm == NULL here and the compose below
		 * fails, the log_debug() and vm_terminate() dereference a
		 * NULL vm — verify whether a NULL check is needed.
		 */
		if (proc_compose_imsg(ps,
		    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
		    PROC_AGENTX : PROC_CONTROL, -1, imsg->hdr.type,
		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
			log_debug("%s: GET_INFO_VM failed for vm %d, removing",
			    __func__, vm->vm_vmid);
			vm_terminate(vm, __func__);
			return (-1);
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
		/*
		 * PROC_VMM has responded with the *running* VMs, now we
		 * append the others. These use the special value 0 for their
		 * kernel id to indicate that they are not running.
		 */
		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
			if (!(vm->vm_state & VM_STATE_RUNNING)) {
				memset(&vir, 0, sizeof(vir));
				vir.vir_info.vir_id = vm->vm_vmid;
				strlcpy(vir.vir_info.vir_name,
				    vm->vm_params.vmc_params.vcp_name,
				    VMM_MAX_NAME_LEN);
				vir.vir_info.vir_memory_size =
				    vm->vm_params.vmc_params.
				    vcp_memranges[0].vmr_size;
				vir.vir_info.vir_ncpus =
				    vm->vm_params.vmc_params.vcp_ncpus;
				/* get the configured user id for this vm */
				vir.vir_uid = vm->vm_params.vmc_owner.uid;
				vir.vir_gid = vm->vm_params.vmc_owner.gid;
				log_debug("%s: vm: %d, vm_state: 0x%x",
				    __func__, vm->vm_vmid, vm->vm_state);
				vir.vir_state = vm->vm_state;
				if (proc_compose_imsg(ps,
				    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
				    PROC_AGENTX : PROC_CONTROL, -1,
				    IMSG_VMDOP_GET_INFO_VM_DATA,
				    imsg->hdr.peerid, -1, &vir,
				    sizeof(vir)) == -1) {
					log_debug("%s: GET_INFO_VM_END failed",
					    __func__);
					vm_terminate(vm, __func__);
					return (-1);
				}
			}
		}
		IMSG_SIZE_CHECK(imsg, &res);
		/* Terminate the listing with the END marker. */
		proc_forward_imsg(ps, imsg,
		    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
		    PROC_AGENTX : PROC_CONTROL, -1);
		break;
	default:
		return (-1);
	}

	return (0);
}
581
582int
583vmd_dispatch_agentx(int fd, struct privsep_proc *p, struct imsg *imsg)
584{
585	struct privsep			*ps = p->p_ps;
586
587	switch (imsg->hdr.type) {
588	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
589		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
590		return (0);
591	default:
592		break;
593	}
594	return (-1);
595}
596
597int
598vmd_dispatch_priv(int fd, struct privsep_proc *p, struct imsg *imsg)
599{
600	struct vmop_addr_result	 var;
601
602	switch (imsg->hdr.type) {
603	case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
604		IMSG_SIZE_CHECK(imsg, &var);
605		memcpy(&var, imsg->data, sizeof(var));
606		proc_forward_imsg(p->p_ps, imsg, PROC_VMM, -1);
607		break;
608	default:
609		return (-1);
610	}
611
612	return (0);
613}
614
615int
616vmd_check_vmh(struct vm_dump_header *vmh)
617{
618	int i;
619	unsigned int code, leaf;
620	unsigned int a, b, c, d;
621
622	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) {
623		log_warnx("%s: incompatible dump signature", __func__);
624		return (-1);
625	}
626
627	if (vmh->vmh_version != VM_DUMP_VERSION) {
628		log_warnx("%s: incompatible dump version", __func__);
629		return (-1);
630	}
631
632	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
633		code = vmh->vmh_cpuids[i].code;
634		leaf = vmh->vmh_cpuids[i].leaf;
635		if (leaf != 0x00) {
636			log_debug("%s: invalid leaf 0x%x for code 0x%x",
637			    __func__, leaf, code);
638			return (-1);
639		}
640
641		switch (code) {
642		case 0x00:
643			CPUID_LEAF(code, leaf, a, b, c, d);
644			if (vmh->vmh_cpuids[i].a > a) {
645				log_debug("%s: incompatible cpuid level",
646				    __func__);
647				return (-1);
648			}
649			if (!(vmh->vmh_cpuids[i].b == b &&
650			    vmh->vmh_cpuids[i].c == c &&
651			    vmh->vmh_cpuids[i].d == d)) {
652				log_debug("%s: incompatible cpu brand",
653				    __func__);
654				return (-1);
655			}
656			break;
657
658		case 0x01:
659			CPUID_LEAF(code, leaf, a, b, c, d);
660			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
661			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
662				log_debug("%s: incompatible cpu features "
663				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
664				    code, leaf);
665				return (-1);
666			}
667			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
668			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
669				log_debug("%s: incompatible cpu features "
670				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
671				    code, leaf);
672				return (-1);
673			}
674			break;
675
676		case 0x07:
677			CPUID_LEAF(code, leaf, a, b, c, d);
678			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
679			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
680				log_debug("%s: incompatible cpu features "
681				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
682				    code, leaf);
683				return (-1);
684			}
685			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
686			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
687				log_debug("%s: incompatible cpu features "
688				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
689				    code, leaf);
690				return (-1);
691			}
692			break;
693
694		case 0x0d:
695			CPUID_LEAF(code, leaf, a, b, c, d);
696			if (vmh->vmh_cpuids[i].b > b) {
697				log_debug("%s: incompatible cpu: insufficient "
698				    "max save area for enabled XCR0 features",
699				    __func__);
700				return (-1);
701			}
702			if (vmh->vmh_cpuids[i].c > c) {
703				log_debug("%s: incompatible cpu: insufficient "
704				    "max save area for supported XCR0 features",
705				    __func__);
706				return (-1);
707			}
708			break;
709
710		case 0x80000001:
711			CPUID_LEAF(code, leaf, a, b, c, d);
712			if ((vmh->vmh_cpuids[i].a & a) !=
713			    vmh->vmh_cpuids[i].a) {
714				log_debug("%s: incompatible cpu features "
715				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
716				    code, leaf);
717				return (-1);
718			}
719			if ((vmh->vmh_cpuids[i].c & c) !=
720			    vmh->vmh_cpuids[i].c) {
721				log_debug("%s: incompatible cpu features "
722				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
723				    code, leaf);
724				return (-1);
725			}
726			if ((vmh->vmh_cpuids[i].d & d) !=
727			    vmh->vmh_cpuids[i].d) {
728				log_debug("%s: incompatible cpu features "
729				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
730				    code, leaf);
731				return (-1);
732			}
733			break;
734
735		default:
736			log_debug("%s: unknown code 0x%x", __func__, code);
737			return (-1);
738		}
739	}
740
741	return (0);
742}
743
744void
745vmd_sighdlr(int sig, short event, void *arg)
746{
747	if (privsep_process != PROC_PARENT)
748		return;
749	log_debug("%s: handling signal", __func__);
750
751	switch (sig) {
752	case SIGHUP:
753		log_info("%s: reload requested with SIGHUP", __func__);
754
755		/*
756		 * This is safe because libevent uses async signal handlers
757		 * that run in the event loop and not in signal context.
758		 */
759		(void)vmd_reload(0, NULL);
760		break;
761	case SIGPIPE:
762		log_info("%s: ignoring SIGPIPE", __func__);
763		break;
764	case SIGUSR1:
765		log_info("%s: ignoring SIGUSR1", __func__);
766		break;
767	case SIGTERM:
768	case SIGINT:
769		vmd_shutdown();
770		break;
771	default:
772		fatalx("unexpected signal");
773	}
774}
775
/* Print a usage summary to stderr and exit with failure status. */
__dead void
usage(void)
{
	extern char *__progname;
	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
	    __progname);
	exit(1);
}
784
/*
 * vmd entry point.  Parses command-line options, then either:
 *  - short-circuits into a VM or device subprocess (-V/-X, used by
 *    the internal fork+exec re-launch path),
 *  - runs as one of the privsep child processes (-P), or
 *  - continues as the parent: opens /dev/vmm, forks the children,
 *    daemonizes, installs signal handlers and enters the event loop.
 */
int
main(int argc, char **argv)
{
	struct privsep		*ps;
	int			 ch;
	enum privsep_procid	 proc_id = PROC_PARENT;
	int			 proc_instance = 0, vm_launch = 0;
	int			 vmm_fd = -1, vm_fd = -1;
	const char		*errp, *title = NULL;
	int			 argc0 = argc;
	char			 dev_type = '\0';

	log_init(0, LOG_DAEMON);

	if ((env = calloc(1, sizeof(*env))) == NULL)
		fatal("calloc: env");

	while ((ch = getopt(argc, argv, "D:P:I:V:X:df:i:nt:v")) != -1) {
		switch (ch) {
		case 'D':
			/* Define a config-file macro on the command line. */
			if (cmdline_symset(optarg) < 0)
				log_warnx("could not parse macro definition %s",
				    optarg);
			break;
		case 'd':
			env->vmd_debug = 2;
			break;
		case 'f':
			conffile = optarg;
			break;
		case 'v':
			env->vmd_verbose++;
			break;
		/* vmd fork/exec */
		case 'n':
			/* Config-test mode: parse and exit. */
			env->vmd_noaction = 1;
			break;
		case 'P':
			/* Run as the named privsep child process. */
			title = optarg;
			proc_id = proc_getid(procs, nitems(procs), title);
			if (proc_id == PROC_MAX)
				fatalx("invalid process name");
			break;
		case 'I':
			proc_instance = strtonum(optarg, 0,
			    PROC_MAX_INSTANCES, &errp);
			if (errp)
				fatalx("invalid process instance");
			break;
		/* child vm and device fork/exec */
		case 'V':
			/* Launch a VM process; optarg is its socket fd. */
			vm_launch = VMD_LAUNCH_VM;
			vm_fd = strtonum(optarg, 0, 128, &errp);
			if (errp)
				fatalx("invalid vm fd");
			break;
		case 'X':
			/* Launch a device process; optarg is its fd. */
			vm_launch = VMD_LAUNCH_DEV;
			vm_fd = strtonum(optarg, 0, 128, &errp);
			if (errp)
				fatalx("invalid device fd");
			break;
		case 't':
			dev_type = *optarg;
			switch (dev_type) {
			case VMD_DEVTYPE_NET:
			case VMD_DEVTYPE_DISK:
				break;
			default: fatalx("invalid device type");
			}
			break;
		case 'i':
			/* Inherited /dev/vmm fd for re-exec'd children. */
			vmm_fd = strtonum(optarg, 0, 128, &errp);
			if (errp)
				fatalx("invalid vmm fd");
			break;
		default:
			usage();
		}
	}

	argc -= optind;
	if (argc > 0)
		usage();

	/* -n implies at least minimal debug output to the terminal. */
	if (env->vmd_noaction && !env->vmd_debug)
		env->vmd_debug = 1;

	log_init(env->vmd_debug, LOG_DAEMON);
	log_setverbose(env->vmd_verbose);

	/* Re-exec from the vmm child process requires an absolute path. */
	if (proc_id == PROC_PARENT && *argv[0] != '/' && !env->vmd_noaction)
		fatalx("re-exec requires execution with an absolute path");
	env->argv0 = argv[0];

	/* check for root privileges */
	if (env->vmd_noaction == 0 && !vm_launch) {
		if (geteuid())
			fatalx("need root privileges");
	}

	ps = &env->vmd_ps;
	ps->ps_env = env;
	env->vmd_fd = vmm_fd;

	if (config_init(env) == -1)
		fatal("failed to initialize configuration");

	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
		fatal("unknown user %s", VMD_USER);

	/* First proc runs as root without pledge but in default chroot */
	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */

	/*
	 * If we're launching a new vm or its device, we short out here.
	 */
	if (vm_launch == VMD_LAUNCH_VM) {
		vm_main(vm_fd, vmm_fd);
		/* NOTREACHED */
	} else if (vm_launch == VMD_LAUNCH_DEV) {
		if (dev_type == VMD_DEVTYPE_NET) {
			vionet_main(vm_fd, vmm_fd);
			/* NOTREACHED */
		} else if (dev_type == VMD_DEVTYPE_DISK) {
			vioblk_main(vm_fd, vmm_fd);
			/* NOTREACHED */
		}
		fatalx("unsupported device type '%c'", dev_type);
	}

	/* Open /dev/vmm early. */
	if (env->vmd_noaction == 0 && proc_id == PROC_PARENT) {
		env->vmd_fd = open(VMM_NODE, O_RDWR);
		if (env->vmd_fd == -1)
			fatal("%s", VMM_NODE);
	}

	/* Configure the control socket */
	ps->ps_csock.cs_name = SOCKET_NAME;
	TAILQ_INIT(&ps->ps_rcsocks);

	/* Configuration will be parsed after forking the children */
	env->vmd_conffile = conffile;

	if (env->vmd_noaction)
		ps->ps_noaction = 1;
	ps->ps_instance = proc_instance;
	if (title != NULL)
		ps->ps_title[proc_id] = title;

	/* only the parent returns */
	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
	    proc_id);

	log_procinit("parent");
	if (!env->vmd_debug && daemon(0, 0) == -1)
		fatal("can't daemonize");

	if (ps->ps_noaction == 0)
		log_info("startup");

	event_init();

	/* Route all lifecycle signals through the event loop. */
	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);

	signal_add(&ps->ps_evsigint, NULL);
	signal_add(&ps->ps_evsigterm, NULL);
	signal_add(&ps->ps_evsighup, NULL);
	signal_add(&ps->ps_evsigpipe, NULL);
	signal_add(&ps->ps_evsigusr1, NULL);

	if (!env->vmd_noaction)
		proc_connect(ps);

	if (vmd_configure() == -1)
		fatalx("configuration failed");

	event_dispatch();

	log_debug("parent exiting");

	return (0);
}
975
976void
977start_vm_batch(int fd, short type, void *args)
978{
979	int		i = 0;
980	struct vmd_vm	*vm;
981
982	log_debug("%s: starting batch of %d vms", __func__,
983	    env->vmd_cfg.parallelism);
984	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
985		if (!(vm->vm_state & VM_STATE_WAITING)) {
986			log_debug("%s: not starting vm %s (disabled)",
987			    __func__,
988			    vm->vm_params.vmc_params.vcp_name);
989			continue;
990		}
991		i++;
992		if (i > env->vmd_cfg.parallelism) {
993			evtimer_add(&staggered_start_timer,
994			    &env->vmd_cfg.delay);
995			break;
996		}
997		vm->vm_state &= ~VM_STATE_WAITING;
998		config_setvm(&env->vmd_ps, vm, -1, vm->vm_params.vmc_owner.uid);
999	}
1000	log_debug("%s: done starting vms", __func__);
1001}
1002
/*
 * Post-fork configuration of the parent process: open the pty master
 * device, pledge, parse the configuration file, hand the /dev/vmm fd
 * to the vmm process, push the global config to all children, create
 * the configured switches and kick off the staggered VM start.
 *
 * Returns 0 on success, -1 on failure (or exits for fatal errors and
 * for -n config-test mode).
 */
int
vmd_configure(void)
{
	struct vmd_switch	*vsw;
	int			 mib[] = { CTL_HW, HW_NCPUONLINE };
	int			 ncpus;
	size_t			 len = sizeof(ncpus);

	/* The pty master is needed later to allocate VM consoles. */
	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
		fatal("open %s", PATH_PTMDEV);

	/*
	 * Parent-process pledge promises:
	 *   stdio           - malloc and basic I/O including events
	 *   rpath           - reload reads the configuration files
	 *   wpath           - opening disk images and tap devices
	 *   tty             - openpty and TIOCUCNTL
	 *   proc            - kill children safely on shutdown
	 *   sendfd/recvfd   - pass disks, interfaces, send/receive fds
	 *   getpw           - user/group lookups by name
	 *   chown, fattr    - change tty ownership
	 *   flock           - lock disk files
	 */
	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
	    " chown fattr flock", NULL) == -1)
		fatal("pledge");

	if (parse_config(env->vmd_conffile) == -1) {
		proc_kill(&env->vmd_ps);
		exit(1);
	}

	/* -n: report the parse result and stop here. */
	if (env->vmd_noaction) {
		fprintf(stderr, "configuration OK\n");
		proc_kill(&env->vmd_ps);
		exit(0);
	}

	/* Send VMM device fd to vmm proc. */
	proc_compose_imsg(&env->vmd_ps, PROC_VMM, -1,
	    IMSG_VMDOP_RECEIVE_VMM_FD, -1, env->vmd_fd, NULL, 0);

	/* Send shared global configuration to all children */
	if (config_setconfig(env) == -1)
		return (-1);

	/* Bring up every switch that is not already running. */
	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
		if (vsw->sw_running)
			continue;
		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
			log_warn("%s: failed to create switch %s",
			    __func__, vsw->sw_name);
			switch_remove(vsw);
			return (-1);
		}
	}

	/* Without explicit settings, derive the batch size from the CPUs. */
	if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) {
		env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY;
		if (sysctl(mib, nitems(mib), &ncpus, &len, NULL, 0) == -1)
			ncpus = 1;
		env->vmd_cfg.parallelism = ncpus;
		log_debug("%s: setting staggered start configuration to "
		    "parallelism: %d and delay: %lld",
		    __func__, ncpus, (long long) env->vmd_cfg.delay.tv_sec);
	}

	log_debug("%s: starting vms in staggered fashion", __func__);
	evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
	/* start first batch */
	start_vm_batch(0, 0, NULL);

	return (0);
}
1078
/*
 * vmd_reload
 *
 * Load, reload or reset the running configuration.
 *
 * Parameters:
 *  reset: non-zero to purge and reset the current config instead of
 *         parsing a config file
 *  filename: config file to parse; NULL or "" selects the default file
 *            and turns the operation into a full reload
 *
 * Return values:
 *   0: success
 *  -1: parsing or applying the configuration failed
 */
int
vmd_reload(unsigned int reset, const char *filename)
{
	struct vmd_vm		*vm, *next_vm;
	struct vmd_switch	*vsw;
	int			 reload = 0;

	/* Switch back to the default config file */
	if (filename == NULL || *filename == '\0') {
		filename = env->vmd_conffile;
		reload = 1;
	}

	log_debug("%s: level %d config file %s", __func__, reset, filename);

	if (reset) {
		/* Purge the configuration */
		config_purge(env, reset);
		config_setreset(env, reset);
	} else {
		/*
		 * Load or reload the configuration.
		 *
		 * Reloading removes all non-running VMs before processing the
		 * config file, whereas loading only adds to the existing list
		 * of VMs.
		 */

		if (reload) {
			/* Safe variant: vm_remove() unlinks the entry. */
			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
			    next_vm) {
				if (!(vm->vm_state & VM_STATE_RUNNING)) {
					DPRINTF("%s: calling vm_remove",
					    __func__);
					vm_remove(vm, __func__);
				}
			}
		}

		if (parse_config(filename) == -1) {
			log_debug("%s: failed to load config file %s",
			    __func__, filename);
			return (-1);
		}

		if (reload) {
			/* Update shared global configuration in all children */
			if (config_setconfig(env) == -1)
				return (-1);
		}

		/* Bring up host-side bridges for switches not yet running. */
		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
			if (vsw->sw_running)
				continue;
			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
				log_warn("%s: failed to create switch %s",
				    __func__, vsw->sw_name);
				switch_remove(vsw);
				return (-1);
			}
		}

		log_debug("%s: starting vms in staggered fashion", __func__);
		evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
		/* start first batch */
		start_vm_batch(0, 0, NULL);

		}

	return (0);
}
1150
1151void
1152vmd_shutdown(void)
1153{
1154	struct vmd_vm *vm, *vm_next;
1155
1156	log_debug("%s: performing shutdown", __func__);
1157
1158	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
1159		vm_remove(vm, __func__);
1160	}
1161
1162	proc_kill(&env->vmd_ps);
1163	free(env);
1164
1165	log_warnx("parent terminating");
1166	exit(0);
1167}
1168
1169struct vmd_vm *
1170vm_getbyvmid(uint32_t vmid)
1171{
1172	struct vmd_vm	*vm;
1173
1174	if (vmid == 0)
1175		return (NULL);
1176	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1177		if (vm->vm_vmid == vmid)
1178			return (vm);
1179	}
1180
1181	return (NULL);
1182}
1183
1184struct vmd_vm *
1185vm_getbyid(uint32_t id)
1186{
1187	struct vmd_vm	*vm;
1188
1189	if (id == 0)
1190		return (NULL);
1191	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1192		if (vm->vm_params.vmc_params.vcp_id == id)
1193			return (vm);
1194	}
1195
1196	return (NULL);
1197}
1198
1199uint32_t
1200vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1201{
1202	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1203		return (0);
1204	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1205	    id, vm->vm_vmid);
1206	return (vm->vm_vmid);
1207}
1208
1209uint32_t
1210vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1211{
1212	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1213		return (0);
1214	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1215	    vmid, vm->vm_params.vmc_params.vcp_id);
1216	return (vm->vm_params.vmc_params.vcp_id);
1217}
1218
1219struct vmd_vm *
1220vm_getbyname(const char *name)
1221{
1222	struct vmd_vm	*vm;
1223
1224	if (name == NULL)
1225		return (NULL);
1226	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1227		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1228			return (vm);
1229	}
1230
1231	return (NULL);
1232}
1233
1234struct vmd_vm *
1235vm_getbypid(pid_t pid)
1236{
1237	struct vmd_vm	*vm;
1238
1239	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1240		if (vm->vm_pid == pid)
1241			return (vm);
1242	}
1243
1244	return (NULL);
1245}
1246
/*
 * vm_stop
 *
 * Tears down the runtime state of a VM: clears its run-state flags and
 * closes the imsg channel, disk, nic, kernel and cdrom descriptors.  The
 * VM itself stays on the VM list; see vm_remove() for full removal.
 *
 * Parameters:
 *  vm: the VM to stop (NULL is a no-op)
 *  keeptty: non-zero to keep the console tty and its owner uid intact
 *  caller: name of the calling function, for logging only
 */
void
vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 i, j;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s stopping vm %d%s",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");

	/* No longer received, running or shutting down. */
	vm->vm_state &= ~(VM_STATE_RECEIVED | VM_STATE_RUNNING
	    | VM_STATE_SHUTDOWN);

	/* Stop watching and close the imsg channel to the VM process. */
	if (vm->vm_iev.ibuf.fd != -1) {
		event_del(&vm->vm_iev.ev);
		close(vm->vm_iev.ibuf.fd);
	}
	/* Close all disk images, including base images. */
	for (i = 0; i < VM_MAX_DISKS_PER_VM; i++) {
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
			if (vm->vm_disks[i][j] != -1) {
				close(vm->vm_disks[i][j]);
				vm->vm_disks[i][j] = -1;
			}
		}
	}
	/* Close tap devices and release per-interface strings. */
	for (i = 0; i < VM_MAX_NICS_PER_VM; i++) {
		if (vm->vm_ifs[i].vif_fd != -1) {
			close(vm->vm_ifs[i].vif_fd);
			vm->vm_ifs[i].vif_fd = -1;
		}
		free(vm->vm_ifs[i].vif_name);
		free(vm->vm_ifs[i].vif_switch);
		free(vm->vm_ifs[i].vif_group);
		vm->vm_ifs[i].vif_name = NULL;
		vm->vm_ifs[i].vif_switch = NULL;
		vm->vm_ifs[i].vif_group = NULL;
	}
	if (vm->vm_kernel != -1) {
		close(vm->vm_kernel);
		vm->vm_kernel = -1;
	}
	if (vm->vm_cdrom != -1) {
		close(vm->vm_cdrom);
		vm->vm_cdrom = -1;
	}
	if (!keeptty) {
		vm_closetty(vm);
		vm->vm_uid = 0;
	}
}
1300
1301void
1302vm_remove(struct vmd_vm *vm, const char *caller)
1303{
1304	struct privsep	*ps = &env->vmd_ps;
1305
1306	if (vm == NULL)
1307		return;
1308
1309	log_debug("%s: %s %s removing vm %d from running config",
1310	    __func__, ps->ps_title[privsep_process], caller,
1311	    vm->vm_vmid);
1312
1313	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);
1314
1315	vm_stop(vm, 0, caller);
1316	if (vm->vm_kernel_path != NULL && !vm->vm_from_config)
1317		free(vm->vm_kernel_path);
1318	free(vm);
1319}
1320
1321int
1322vm_claimid(const char *name, int uid, uint32_t *id)
1323{
1324	struct name2id *n2i = NULL;
1325
1326	TAILQ_FOREACH(n2i, env->vmd_known, entry)
1327		if (strcmp(n2i->name, name) == 0 && n2i->uid == uid)
1328			goto out;
1329
1330	if (++env->vmd_nvm == 0) {
1331		log_warnx("too many vms");
1332		return (-1);
1333	}
1334	if ((n2i = calloc(1, sizeof(struct name2id))) == NULL) {
1335		log_warnx("could not alloc vm name");
1336		return (-1);
1337	}
1338	n2i->id = env->vmd_nvm;
1339	n2i->uid = uid;
1340	if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name)) {
1341		log_warnx("vm name too long");
1342		free(n2i);
1343		return (-1);
1344	}
1345	TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry);
1346
1347out:
1348	*id = n2i->id;
1349	return (0);
1350}
1351
/*
 * vm_register
 *
 * Validates a VM create request and registers a new VM on the global VM
 * list.  Resolves instance inheritance first, checks permissions, fills
 * in defaults (cpus, memory), validates limits and the VM name, and
 * initializes all file descriptor slots to -1.
 *
 * Parameters:
 *  ps: the privsep context
 *  vmc: the requested create parameters; copied into the new VM
 *  ret_vm: on success the new VM; on EALREADY the existing VM
 *  id: internal vmd id to use, or 0 to claim one via vm_claimid()
 *  uid: the user ID of the user making the request
 *
 * Return values:
 *   0: success, *ret_vm is the registered VM
 *  -1: failure, errno describes the cause (EALREADY: VM already exists)
 */
int
vm_register(struct privsep *ps, struct vmop_create_params *vmc,
    struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
{
	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
	struct vm_create_params	*vcp = &vmc->vmc_params;
	struct vmop_owner	*vmo = NULL;
	uint32_t		 nid, rng;
	unsigned int		 i, j;
	struct vmd_switch	*sw;
	char			*s;
	int			 ret = 0;

	/* Check if this is an instance of another VM */
	if ((ret = vm_instance(ps, &vm_parent, vmc, uid)) != 0) {
		errno = ret; /* XXX might set invalid errno */
		return (-1);
	}

	errno = 0;
	*ret_vm = NULL;

	/*
	 * A VM with this name or id already exists: hand it back via
	 * *ret_vm with errno EALREADY if the caller may access it.
	 */
	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    uid) != 0) {
			errno = EPERM;
			goto fail;
		}
		/* Adopt the kernel fd from this request. */
		vm->vm_kernel = vmc->vmc_kernel;
		*ret_vm = vm;
		errno = EALREADY;
		goto fail;
	}

	if (vm_parent != NULL)
		vmo = &vm_parent->vm_params.vmc_insowner;

	/* non-root users can only start existing VMs or instances */
	if (vm_checkperm(NULL, vmo, uid) != 0) {
		log_warnx("permission denied");
		errno = EPERM;
		goto fail;
	}
	if (vmc->vmc_flags == 0) {
		log_warnx("invalid configuration, no devices");
		errno = VMD_DISK_MISSING;
		goto fail;
	}
	/* Apply defaults before validating limits. */
	if (vcp->vcp_ncpus == 0)
		vcp->vcp_ncpus = 1;
	if (vcp->vcp_memranges[0].vmr_size == 0)
		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
		log_warnx("invalid number of CPUs");
		goto fail;
	} else if (vmc->vmc_ndisks > VM_MAX_DISKS_PER_VM) {
		log_warnx("invalid number of disks");
		goto fail;
	} else if (vmc->vmc_nnics > VM_MAX_NICS_PER_VM) {
		log_warnx("invalid number of interfaces");
		goto fail;
	} else if (vmc->vmc_kernel == -1 && vmc->vmc_ndisks == 0
	    && strlen(vmc->vmc_cdrom) == 0) {
		log_warnx("no kernel or disk/cdrom specified");
		goto fail;
	} else if (strlen(vcp->vcp_name) == 0) {
		log_warnx("invalid VM name");
		goto fail;
	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
	    *vcp->vcp_name == '_') {
		log_warnx("invalid VM name");
		goto fail;
	} else {
		/* Names are alphanumeric plus '.', '-' and '_'. */
		for (s = vcp->vcp_name; *s != '\0'; ++s) {
			if (!(isalnum((unsigned char)*s) || *s == '.' || \
			    *s == '-' || *s == '_')) {
				log_warnx("invalid VM name");
				goto fail;
			}
		}
	}

	if ((vm = calloc(1, sizeof(*vm))) == NULL)
		goto fail;

	/* From here on, work on the VM's own copy of the parameters. */
	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
	vmc = &vm->vm_params;
	vcp = &vmc->vmc_params;
	vm->vm_pid = -1;
	vm->vm_tty = -1;
	vm->vm_receive_fd = -1;
	vm->vm_kernel = -1;
	vm->vm_state &= ~VM_STATE_PAUSED;

	if (vmc->vmc_kernel > -1)
		vm->vm_kernel = vmc->vmc_kernel;

	/* Mark every fd slot as "not open". */
	for (i = 0; i < VM_MAX_DISKS_PER_VM; i++)
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
			vm->vm_disks[i][j] = -1;
	for (i = 0; i < VM_MAX_NICS_PER_VM; i++)
		vm->vm_ifs[i].vif_fd = -1;
	for (i = 0; i < vmc->vmc_nnics; i++) {
		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
			/* inherit per-interface flags from the switch */
			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
		}

		/*
		 * If the MAC address is zero, always randomize it in vmd(8)
		 * because we cannot rely on the guest OS to do the right
		 * thing like OpenBSD does.  Based on ether_fakeaddr()
		 * from the kernel, incremented by one to differentiate
		 * the source.
		 */
		if (memcmp(zero_mac, &vmc->vmc_macs[i], ETHER_ADDR_LEN) == 0) {
			rng = arc4random();
			vmc->vmc_macs[i][0] = 0xfe;
			vmc->vmc_macs[i][1] = 0xe1;
			vmc->vmc_macs[i][2] = 0xba + 1;
			vmc->vmc_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
			vmc->vmc_macs[i][4] = rng;
			vmc->vmc_macs[i][5] = rng >> 8;
		}
	}
	vm->vm_cdrom = -1;
	vm->vm_iev.ibuf.fd = -1;

	/*
	 * Assign a new internal Id if not specified and we succeed in
	 * claiming a new Id.
	 */
	if (id != 0)
		vm->vm_vmid = id;
	else if (vm_claimid(vcp->vcp_name, uid, &nid) == -1)
		goto fail;
	else
		vm->vm_vmid = nid;

	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);

	*ret_vm = vm;
	return (0);
 fail:
	if (errno == 0)
		errno = EINVAL;
	return (-1);
}
1502
1503int
1504vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1505    struct vmop_create_params *vmc, uid_t uid)
1506{
1507	char			*name;
1508	struct vm_create_params	*vcp = &vmc->vmc_params;
1509	struct vmop_create_params *vmcp;
1510	struct vm_create_params	*vcpp;
1511	unsigned int		 i, j;
1512
1513	/* return without error if the parent is NULL (nothing to inherit) */
1514	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1515	    vmc->vmc_instance[0] == '\0')
1516		return (0);
1517
1518	if ((*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) {
1519		return (VMD_PARENT_INVALID);
1520	}
1521
1522	vmcp = &(*vm_parent)->vm_params;
1523	vcpp = &vmcp->vmc_params;
1524
1525	/* Are we allowed to create an instance from this VM? */
1526	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1527		log_warnx("vm \"%s\" no permission to create vm instance",
1528		    vcpp->vcp_name);
1529		return (ENAMETOOLONG);
1530	}
1531
1532	name = vcp->vcp_name;
1533
1534	if (vm_getbyname(vcp->vcp_name) != NULL ||
1535	    vm_getbyvmid(vcp->vcp_id) != NULL) {
1536		return (EPROCLIM);
1537	}
1538
1539	/* CPU */
1540	if (vcp->vcp_ncpus == 0)
1541		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1542	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1543	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1544		log_warnx("vm \"%s\" no permission to set cpus", name);
1545		return (EPERM);
1546	}
1547
1548	/* memory */
1549	if (vcp->vcp_memranges[0].vmr_size == 0)
1550		vcp->vcp_memranges[0].vmr_size =
1551		    vcpp->vcp_memranges[0].vmr_size;
1552	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1553	    vcp->vcp_memranges[0].vmr_size !=
1554	    vcpp->vcp_memranges[0].vmr_size) {
1555		log_warnx("vm \"%s\" no permission to set memory", name);
1556		return (EPERM);
1557	}
1558
1559	/* disks cannot be inherited */
1560	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1561	    vmc->vmc_ndisks) {
1562		log_warnx("vm \"%s\" no permission to set disks", name);
1563		return (EPERM);
1564	}
1565	for (i = 0; i < vmc->vmc_ndisks; i++) {
1566		/* Check if this disk is already used in the parent */
1567		for (j = 0; j < vmcp->vmc_ndisks; j++) {
1568			if (strcmp(vmc->vmc_disks[i],
1569			    vmcp->vmc_disks[j]) == 0) {
1570				log_warnx("vm \"%s\" disk %s cannot be reused",
1571				    name, vmc->vmc_disks[i]);
1572				return (EBUSY);
1573			}
1574		}
1575		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1576	}
1577
1578	/* interfaces */
1579	if (vmc->vmc_nnics > 0 &&
1580	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1581	    vmc->vmc_nnics != vmcp->vmc_nnics) {
1582		log_warnx("vm \"%s\" no permission to set interfaces", name);
1583		return (EPERM);
1584	}
1585	for (i = 0; i < vmcp->vmc_nnics; i++) {
1586		/* Interface got overwritten */
1587		if (i < vmc->vmc_nnics)
1588			continue;
1589
1590		/* Copy interface from parent */
1591		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1592		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1593		    sizeof(vmc->vmc_ifnames[i]));
1594		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1595		    sizeof(vmc->vmc_ifswitch[i]));
1596		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1597		    sizeof(vmc->vmc_ifgroup[i]));
1598		memcpy(vmc->vmc_macs[i], vmcp->vmc_macs[i],
1599		    sizeof(vmc->vmc_macs[i]));
1600		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1601		vmc->vmc_nnics++;
1602	}
1603	for (i = 0; i < vmc->vmc_nnics; i++) {
1604		for (j = 0; j < vmcp->vmc_nnics; j++) {
1605			if (memcmp(zero_mac, vmc->vmc_macs[i],
1606			    sizeof(vmc->vmc_macs[i])) != 0 &&
1607			    memcmp(vmcp->vmc_macs[i], vmc->vmc_macs[i],
1608			    sizeof(vmc->vmc_macs[i])) != 0) {
1609				log_warnx("vm \"%s\" lladdr cannot be reused",
1610				    name);
1611				return (EBUSY);
1612			}
1613			if (strlen(vmc->vmc_ifnames[i]) &&
1614			    strcmp(vmc->vmc_ifnames[i],
1615			    vmcp->vmc_ifnames[j]) == 0) {
1616				log_warnx("vm \"%s\" %s cannot be reused",
1617				    vmc->vmc_ifnames[i], name);
1618				return (EBUSY);
1619			}
1620		}
1621	}
1622
1623	/* kernel */
1624	if (vmc->vmc_kernel > -1 || ((*vm_parent)->vm_kernel_path != NULL &&
1625		strnlen((*vm_parent)->vm_kernel_path, PATH_MAX) < PATH_MAX)) {
1626		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1627			log_warnx("vm \"%s\" no permission to set boot image",
1628			    name);
1629			return (EPERM);
1630		}
1631		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1632	}
1633
1634	/* cdrom */
1635	if (strlen(vmc->vmc_cdrom) > 0) {
1636		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1637			log_warnx("vm \"%s\" no permission to set cdrom", name);
1638			return (EPERM);
1639		}
1640		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1641	} else if (strlcpy(vmc->vmc_cdrom, vmcp->vmc_cdrom,
1642	    sizeof(vmc->vmc_cdrom)) >= sizeof(vmc->vmc_cdrom)) {
1643		log_warnx("vm \"%s\" cdrom name too long", name);
1644		return (EINVAL);
1645	}
1646
1647	/* user */
1648	if (vmc->vmc_owner.uid == 0)
1649		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1650	else if (vmc->vmc_owner.uid != uid &&
1651	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1652		log_warnx("vm \"%s\" user mismatch", name);
1653		return (EPERM);
1654	}
1655
1656	/* group */
1657	if (vmc->vmc_owner.gid == 0)
1658		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1659	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1660		log_warnx("vm \"%s\" group mismatch", name);
1661		return (EPERM);
1662	}
1663
1664	/* child instances */
1665	if (vmc->vmc_insflags) {
1666		log_warnx("vm \"%s\" cannot change instance permissions", name);
1667		return (EPERM);
1668	}
1669	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1670		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
1671		vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid;
1672		vmc->vmc_insflags = vmcp->vmc_insflags;
1673	} else {
1674		vmc->vmc_insowner.gid = 0;
1675		vmc->vmc_insowner.uid = 0;
1676		vmc->vmc_insflags = 0;
1677	}
1678
1679	/* finished, remove instance flags */
1680	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1681
1682	return (0);
1683}
1684
1685/*
1686 * vm_checkperm
1687 *
1688 * Checks if the user represented by the 'uid' parameter is allowed to
1689 * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1690 * console.)
1691 *
1692 * Parameters:
1693 *  vm: the VM whose permission is to be checked
1694 *  vmo: the required uid/gid to be checked
1695 *  uid: the user ID of the user making the request
1696 *
1697 * Return values:
1698 *   0: the permission should be granted
1699 *  -1: the permission check failed (also returned if vm == null)
1700 */
1701int
1702vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1703{
1704	struct group	*gr;
1705	struct passwd	*pw;
1706	char		**grmem;
1707
1708	/* root has no restrictions */
1709	if (uid == 0)
1710		return (0);
1711
1712	if (vmo == NULL)
1713		return (-1);
1714
1715	/* check user */
1716	if (vm == NULL) {
1717		if  (vmo->uid == uid)
1718			return (0);
1719	} else {
1720		/*
1721		 * check user of running vm (the owner of a running vm can
1722		 * be different to (or more specific than) the configured owner.
1723		 */
1724		if (((vm->vm_state & VM_STATE_RUNNING) && vm->vm_uid == uid) ||
1725		    (!(vm->vm_state & VM_STATE_RUNNING) && vmo->uid == uid))
1726			return (0);
1727	}
1728
1729	/* check groups */
1730	if (vmo->gid != -1) {
1731		if ((pw = getpwuid(uid)) == NULL)
1732			return (-1);
1733		if (pw->pw_gid == vmo->gid)
1734			return (0);
1735		if ((gr = getgrgid(vmo->gid)) != NULL) {
1736			for (grmem = gr->gr_mem; *grmem; grmem++)
1737				if (strcmp(*grmem, pw->pw_name) == 0)
1738					return (0);
1739		}
1740	}
1741
1742	return (-1);
1743}
1744
1745/*
1746 * vm_checkinsflag
1747 *
1748 * Checks whether the non-root user is allowed to set an instance option.
1749 *
1750 * Parameters:
1751 *  vmc: the VM create parameters
1752 *  flag: the flag to be checked
1753 *  uid: the user ID of the user making the request
1754 *
1755 * Return values:
1756 *   0: the permission should be granted
1757 *  -1: the permission check failed (also returned if vm == null)
1758 */
1759int
1760vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1761{
1762	/* root has no restrictions */
1763	if (uid == 0)
1764		return (0);
1765
1766	if ((vmc->vmc_insflags & flag) == 0)
1767		return (-1);
1768
1769	return (0);
1770}
1771
1772/*
1773 * vm_checkaccess
1774 *
1775 * Checks if the user represented by the 'uid' parameter is allowed to
1776 * access the file described by the 'path' parameter.
1777 *
1778 * Parameters:
1779 *  fd: the file descriptor of the opened file
1780 *  uflag: check if the userid has access to the file
1781 *  uid: the user ID of the user making the request
1782 *  amode: the access flags of R_OK and W_OK
1783 *
1784 * Return values:
1785 *   0: the permission should be granted
1786 *  -1: the permission check failed
1787 */
1788int
1789vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
1790{
1791	struct group	*gr;
1792	struct passwd	*pw;
1793	char		**grmem;
1794	struct stat	 st;
1795	mode_t		 mode;
1796
1797	if (fd == -1)
1798		return (-1);
1799
1800	/*
1801	 * File has to be accessible and a regular file
1802	 */
1803	if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode))
1804		return (-1);
1805
1806	/* root has no restrictions */
1807	if (uid == 0 || uflag == 0)
1808		return (0);
1809
1810	/* check other */
1811	mode = amode & W_OK ? S_IWOTH : 0;
1812	mode |= amode & R_OK ? S_IROTH : 0;
1813	if ((st.st_mode & mode) == mode)
1814		return (0);
1815
1816	/* check user */
1817	mode = amode & W_OK ? S_IWUSR : 0;
1818	mode |= amode & R_OK ? S_IRUSR : 0;
1819	if (uid == st.st_uid && (st.st_mode & mode) == mode)
1820		return (0);
1821
1822	/* check groups */
1823	mode = amode & W_OK ? S_IWGRP : 0;
1824	mode |= amode & R_OK ? S_IRGRP : 0;
1825	if ((st.st_mode & mode) != mode)
1826		return (-1);
1827	if ((pw = getpwuid(uid)) == NULL)
1828		return (-1);
1829	if (pw->pw_gid == st.st_gid)
1830		return (0);
1831	if ((gr = getgrgid(st.st_gid)) != NULL) {
1832		for (grmem = gr->gr_mem; *grmem; grmem++)
1833			if (strcmp(*grmem, pw->pw_name) == 0)
1834				return (0);
1835	}
1836
1837	return (-1);
1838}
1839
/*
 * vm_opentty
 *
 * Allocates a pty pair for the VM console via the pre-opened PTM device,
 * keeps the controller fd in vm->vm_tty and adjusts ownership and mode
 * of the tty device node.
 *
 * Parameters:
 *  vm: the VM to open a console tty for
 *
 * Return values:
 *   0: success
 *  -1: failure; any partially set up tty state is released again
 */
int
vm_opentty(struct vmd_vm *vm)
{
	struct ptmget		 ptm;
	struct stat		 st;
	struct group		*gr;
	uid_t			 uid;
	gid_t			 gid;
	mode_t			 mode;
	int			 on;

	/*
	 * Open tty with pre-opened PTM fd
	 */
	if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1))
		return (-1);

	/*
	 * We use user ioctl(2) mode to pass break commands.
	 */
	on = 1;
	if (ioctl(ptm.cfd, TIOCUCNTL, &on) == -1)
		fatal("could not enable user ioctl mode");

	/* Keep the controller fd; the other side is closed again here. */
	vm->vm_tty = ptm.cfd;
	close(ptm.sfd);
	if (strlcpy(vm->vm_ttyname, ptm.sn, sizeof(vm->vm_ttyname))
	    >= sizeof(vm->vm_ttyname)) {
		log_warnx("%s: truncated ttyname", __func__);
		goto fail;
	}

	uid = vm->vm_uid;
	gid = vm->vm_params.vmc_owner.gid;

	/* Pick group and mode: owner group, the "tty" group, or private. */
	if (vm->vm_params.vmc_owner.gid != -1) {
		mode = 0660;
	} else if ((gr = getgrnam("tty")) != NULL) {
		gid = gr->gr_gid;
		mode = 0620;
	} else {
		mode = 0600;
		gid = 0;
	}

	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
	    __func__, vm->vm_params.vmc_params.vcp_name,
	    vm->vm_ttyname, uid, gid, mode);

	/*
	 * Change ownership and mode of the tty as required.
	 * Loosely based on the implementation of sshpty.c
	 */
	if (stat(vm->vm_ttyname, &st) == -1)
		goto fail;

	if (st.st_uid != uid || st.st_gid != gid) {
		if (chown(vm->vm_ttyname, uid, gid) == -1) {
			log_warn("chown %s %d %d failed, uid %d",
			    vm->vm_ttyname, uid, gid, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
		if (chmod(vm->vm_ttyname, mode) == -1) {
			log_warn("chmod %s %o failed, uid %d",
			    vm->vm_ttyname, mode, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	return (0);
 fail:
	vm_closetty(vm);
	return (-1);
}
1925
1926void
1927vm_closetty(struct vmd_vm *vm)
1928{
1929	if (vm->vm_tty != -1) {
1930		/* Release and close the tty */
1931		if (fchown(vm->vm_tty, 0, 0) == -1)
1932			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1933		if (fchmod(vm->vm_tty, 0666) == -1)
1934			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1935		close(vm->vm_tty);
1936		vm->vm_tty = -1;
1937	}
1938	memset(&vm->vm_ttyname, 0, sizeof(vm->vm_ttyname));
1939}
1940
1941void
1942switch_remove(struct vmd_switch *vsw)
1943{
1944	if (vsw == NULL)
1945		return;
1946
1947	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1948
1949	free(vsw->sw_group);
1950	free(vsw->sw_name);
1951	free(vsw);
1952}
1953
1954struct vmd_switch *
1955switch_getbyname(const char *name)
1956{
1957	struct vmd_switch	*vsw;
1958
1959	if (name == NULL)
1960		return (NULL);
1961	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1962		if (strcmp(vsw->sw_name, name) == 0)
1963			return (vsw);
1964	}
1965
1966	return (NULL);
1967}
1968
/*
 * get_string
 *
 * Extracts the leading run of printable characters from a fixed-size,
 * not necessarily NUL-terminated buffer and returns it as a freshly
 * allocated, NUL-terminated string.  The caller must free the result.
 *
 * Returns NULL if memory allocation fails.
 */
char *
get_string(uint8_t *ptr, size_t len)
{
	size_t	 i;
	char	*s;

	for (i = 0; i < len; i++)
		if (!isprint((unsigned char)ptr[i]))
			break;

	/*
	 * Copy by hand instead of strndup(3): ptr is uint8_t *, and
	 * passing it to strndup's "const char *" parameter relied on an
	 * implicit pointer-sign conversion.
	 */
	if ((s = malloc(i + 1)) == NULL)
		return (NULL);
	memcpy(s, ptr, i);
	s[i] = '\0';
	return (s);
}
1980
/*
 * prefixlen2mask
 *
 * Converts an IPv4 prefix length (clamped to 32) into a netmask in
 * network byte order.  A prefix length of 0 yields an all-zero mask.
 */
uint32_t
prefixlen2mask(uint8_t prefixlen)
{
	uint32_t	 hostbits;

	if (prefixlen == 0)
		return (0);
	if (prefixlen > 32)
		prefixlen = 32;

	/* prefixlen >= 1 here, so the shift count stays below 32. */
	hostbits = 32 - prefixlen;
	return (htonl(0xffffffff << hostbits));
}
1992
/*
 * prefixlen2mask6
 *
 * Converts an IPv6 prefix length (clamped to 128) into a netmask,
 * written to *mask.
 */
void
prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
{
	struct in6_addr	 tmp;
	int		 full, rem;

	if (prefixlen > 128)
		prefixlen = 128;

	full = prefixlen / 8;	/* bytes that are entirely ones */
	rem = prefixlen % 8;	/* leading one-bits in the next byte */

	memset(&tmp, 0, sizeof(tmp));
	memset(tmp.s6_addr, 0xff, full);
	if (rem != 0)
		tmp.s6_addr[full] = 0xff00 >> rem;

	memcpy(mask, &tmp, sizeof(tmp));
}
2011
2012void
2013getmonotime(struct timeval *tv)
2014{
2015	struct timespec	 ts;
2016
2017	if (clock_gettime(CLOCK_MONOTONIC, &ts))
2018		fatal("clock_gettime");
2019
2020	TIMESPEC_TO_TIMEVAL(tv, &ts);
2021}
2022
2023static inline void
2024vm_terminate(struct vmd_vm *vm, const char *caller)
2025{
2026	if (vm->vm_from_config)
2027		vm_stop(vm, 0, caller);
2028	else {
2029		/* vm_remove calls vm_stop */
2030		vm_remove(vm, caller);
2031	}
2032}
2033
2034/*
2035 * Utility function for closing vm file descriptors. Assumes an fd of -1 was
2036 * already closed or never opened.
2037 *
2038 * Returns 0 on success, otherwise -1 on failure.
2039 */
2040int
2041close_fd(int fd)
2042{
2043	int	ret;
2044
2045	if (fd == -1)
2046		return (0);
2047
2048#ifdef POSIX_CLOSE_RESTART
2049	do { ret = close(fd); } while (ret == -1 && errno == EINTR);
2050#else
2051	ret = close(fd);
2052#endif /* POSIX_CLOSE_RESTART */
2053
2054	if (ret == -1 && errno == EIO)
2055		log_warn("%s(%d)", __func__, fd);
2056
2057	return (ret);
2058}
2059