vmd.c revision 1.127
1112158Sdas/*	$OpenBSD: vmd.c,v 1.127 2021/11/29 05:17:35 deraadt Exp $	*/
2112158Sdas
3112158Sdas/*
4112158Sdas * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5112158Sdas *
6112158Sdas * Permission to use, copy, modify, and distribute this software for any
7112158Sdas * purpose with or without fee is hereby granted, provided that the above
8112158Sdas * copyright notice and this permission notice appear in all copies.
9112158Sdas *
10112158Sdas * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11112158Sdas * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12112158Sdas * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13112158Sdas * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14112158Sdas * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15112158Sdas * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16112158Sdas * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17112158Sdas */
18112158Sdas
19112158Sdas#include <sys/types.h>
20112158Sdas#include <sys/queue.h>
21112158Sdas#include <sys/wait.h>
22112158Sdas#include <sys/cdefs.h>
23112158Sdas#include <sys/stat.h>
24112158Sdas#include <sys/sysctl.h>
25112158Sdas#include <sys/tty.h>
26112158Sdas#include <sys/ttycom.h>
27112158Sdas#include <sys/ioctl.h>
28112158Sdas
29165743Sdas#include <stdio.h>
30165743Sdas#include <stdlib.h>
31112158Sdas#include <string.h>
32112158Sdas#include <termios.h>
33112158Sdas#include <errno.h>
34112158Sdas#include <event.h>
35112158Sdas#include <fcntl.h>
36112158Sdas#include <pwd.h>
37165743Sdas#include <signal.h>
38112158Sdas#include <syslog.h>
39112158Sdas#include <unistd.h>
40112158Sdas#include <util.h>
41112158Sdas#include <ctype.h>
42112158Sdas#include <pwd.h>
43112158Sdas#include <grp.h>
44112158Sdas
45112158Sdas#include <machine/specialreg.h>
46112158Sdas#include <machine/vmmvar.h>
47112158Sdas
48112158Sdas#include "proc.h"
49112158Sdas#include "atomicio.h"
50112158Sdas#include "vmd.h"
51112158Sdas
52112158Sdas__dead void usage(void);
53112158Sdas
54112158Sdasint	 main(int, char **);
55112158Sdasint	 vmd_configure(void);
56112158Sdasvoid	 vmd_sighdlr(int sig, short event, void *arg);
57112158Sdasvoid	 vmd_shutdown(void);
58112158Sdasint	 vmd_control_run(void);
59112158Sdasint	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
60112158Sdasint	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
61112158Sdasint	 vmd_dispatch_priv(int, struct privsep_proc *, struct imsg *);
62112158Sdasint	 vmd_check_vmh(struct vm_dump_header *);
63112158Sdas
64112158Sdasint	 vm_instance(struct privsep *, struct vmd_vm **,
65112158Sdas	    struct vmop_create_params *, uid_t);
66112158Sdasint	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
67112158Sdasint	 vm_claimid(const char *, int, uint32_t *);
68112158Sdasvoid	 start_vm_batch(int, short, void*);
69112158Sdas
70112158Sdasstruct vmd	*env;
71112158Sdas
72112158Sdasstatic struct privsep_proc procs[] = {
73112158Sdas	/* Keep "priv" on top as procs[0] */
74112158Sdas	{ "priv",	PROC_PRIV,	vmd_dispatch_priv, priv },
75112158Sdas	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
76112158Sdas	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm, vmm_shutdown },
77112158Sdas};
78219557Sdas
79219557Sdasenum privsep_procid privsep_process;
80112158Sdas
81219557Sdasstruct event staggered_start_timer;
82112158Sdas
83112158Sdas/* For the privileged process */
84112158Sdasstatic struct privsep_proc *proc_priv = &procs[0];
85112158Sdasstatic struct passwd proc_privpw;
86112158Sdasstatic const uint8_t zero_mac[ETHER_ADDR_LEN];
87112158Sdas
88112158Sdasint
89112158Sdasvmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
90112158Sdas{
91112158Sdas	struct privsep			*ps = p->p_ps;
92112158Sdas	int				 res = 0, ret = 0, cmd = 0, verbose;
93112158Sdas	unsigned int			 v = 0, flags;
94112158Sdas	struct vmop_create_params	 vmc;
95112158Sdas	struct vmop_id			 vid;
96112158Sdas	struct vmop_result		 vmr;
97112158Sdas	struct vm_dump_header		 vmh;
98112158Sdas	struct vmd_vm			*vm = NULL;
99112158Sdas	char				*str = NULL;
100112158Sdas	uint32_t			 id = 0;
101112158Sdas	struct control_sock		*rcs;
102112158Sdas
103112158Sdas	switch (imsg->hdr.type) {
104112158Sdas	case IMSG_VMDOP_START_VM_REQUEST:
105112158Sdas		IMSG_SIZE_CHECK(imsg, &vmc);
106112158Sdas		memcpy(&vmc, imsg->data, sizeof(vmc));
107112158Sdas		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
108112158Sdas		if (vmc.vmc_flags == 0) {
109112158Sdas			/* start an existing VM with pre-configured options */
110112158Sdas			if (!(ret == -1 && errno == EALREADY &&
111112158Sdas			    !(vm->vm_state & VM_STATE_RUNNING))) {
112112158Sdas				res = errno;
113112158Sdas				cmd = IMSG_VMDOP_START_VM_RESPONSE;
114112158Sdas			}
115112158Sdas		} else if (ret != 0) {
116112158Sdas			res = errno;
117112158Sdas			cmd = IMSG_VMDOP_START_VM_RESPONSE;
118112158Sdas		}
119112158Sdas		if (res == 0) {
120112158Sdas			res = config_setvm(ps, vm, imsg->hdr.peerid,
121112158Sdas			    vm->vm_params.vmc_owner.uid);
122112158Sdas			if (res)
123112158Sdas				cmd = IMSG_VMDOP_START_VM_RESPONSE;
124112158Sdas		}
125112158Sdas		break;
126112158Sdas	case IMSG_VMDOP_WAIT_VM_REQUEST:
127219557Sdas	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
128219557Sdas		IMSG_SIZE_CHECK(imsg, &vid);
129112158Sdas		memcpy(&vid, imsg->data, sizeof(vid));
130112158Sdas		flags = vid.vid_flags;
131112158Sdas		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
132112158Sdas
133182709Sdas		if ((id = vid.vid_id) == 0) {
134182709Sdas			/* Lookup vm (id) by name */
135182709Sdas			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
136182709Sdas				res = ENOENT;
137182709Sdas				break;
138182709Sdas			} else if ((vm->vm_state & VM_STATE_SHUTDOWN) &&
139182709Sdas			    (flags & VMOP_FORCE) == 0) {
140182709Sdas				res = EALREADY;
141182709Sdas				break;
142182709Sdas			} else if (!(vm->vm_state & VM_STATE_RUNNING)) {
143182709Sdas				res = EINVAL;
144182709Sdas				break;
145182709Sdas			}
146112158Sdas			id = vm->vm_vmid;
147112158Sdas		} else if ((vm = vm_getbyvmid(id)) == NULL) {
148112158Sdas			res = ENOENT;
149112158Sdas			break;
150112158Sdas		}
151112158Sdas		if (vm_checkperm(vm, &vm->vm_params.vmc_owner, vid.vid_uid)) {
152112158Sdas			res = EPERM;
153219557Sdas			break;
154219557Sdas		}
155112158Sdas
156112158Sdas		/* Only relay TERMINATION requests, not WAIT requests */
157219557Sdas		if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_REQUEST) {
158112158Sdas			memset(&vid, 0, sizeof(vid));
159112158Sdas			vid.vid_id = id;
160112158Sdas			vid.vid_flags = flags;
161112158Sdas
162112158Sdas			if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
163112158Sdas				imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
164219557Sdas				return (-1);
165112158Sdas		}
166219557Sdas		break;
167112158Sdas	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
168112158Sdas		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
169112158Sdas		break;
170112158Sdas	case IMSG_VMDOP_LOAD:
171112158Sdas		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
172219557Sdas		str = get_string((uint8_t *)imsg->data,
173112158Sdas		    IMSG_DATA_SIZE(imsg));
174112158Sdas	case IMSG_VMDOP_RELOAD:
175112158Sdas		if (vmd_reload(0, str) == -1)
176112158Sdas			cmd = IMSG_CTL_FAIL;
177112158Sdas		else
178112158Sdas			cmd = IMSG_CTL_OK;
179219557Sdas		free(str);
180112158Sdas		break;
181219557Sdas	case IMSG_CTL_RESET:
182112158Sdas		IMSG_SIZE_CHECK(imsg, &v);
183112158Sdas		memcpy(&v, imsg->data, sizeof(v));
184112158Sdas		if (vmd_reload(v, NULL) == -1)
185112158Sdas			cmd = IMSG_CTL_FAIL;
186112158Sdas		else
187112158Sdas			cmd = IMSG_CTL_OK;
188112158Sdas		break;
189112158Sdas	case IMSG_CTL_VERBOSE:
190112158Sdas		IMSG_SIZE_CHECK(imsg, &verbose);
191182709Sdas		memcpy(&verbose, imsg->data, sizeof(verbose));
192112158Sdas		log_setverbose(verbose);
193182709Sdas
194112158Sdas		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
195182709Sdas		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
196182709Sdas		cmd = IMSG_CTL_OK;
197112158Sdas		break;
198112158Sdas	case IMSG_VMDOP_PAUSE_VM:
199112158Sdas	case IMSG_VMDOP_UNPAUSE_VM:
200219557Sdas		IMSG_SIZE_CHECK(imsg, &vid);
201112158Sdas		memcpy(&vid, imsg->data, sizeof(vid));
202219557Sdas		if (vid.vid_id == 0) {
203112158Sdas			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
204219557Sdas				res = ENOENT;
205112158Sdas				cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
206219557Sdas				    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
207219557Sdas				    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
208219557Sdas				break;
209112158Sdas			} else {
210219557Sdas				vid.vid_id = vm->vm_vmid;
211219557Sdas			}
212112158Sdas		} else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
213112158Sdas			res = ENOENT;
214112158Sdas			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
215112158Sdas			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
216112158Sdas			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
217219557Sdas			break;
218112158Sdas		}
219219557Sdas		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
220112158Sdas		    vid.vid_uid) != 0) {
221112158Sdas			res = EPERM;
222112158Sdas			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
223112158Sdas			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
224112158Sdas			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
225112158Sdas			break;
226112158Sdas		}
227112158Sdas		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
228112158Sdas		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
229112158Sdas		break;
230112158Sdas	case IMSG_VMDOP_SEND_VM_REQUEST:
231112158Sdas		IMSG_SIZE_CHECK(imsg, &vid);
232112158Sdas		memcpy(&vid, imsg->data, sizeof(vid));
233112158Sdas		id = vid.vid_id;
234112158Sdas		if (vid.vid_id == 0) {
235112158Sdas			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
236112158Sdas				res = ENOENT;
237112158Sdas				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
238112158Sdas				close(imsg->fd);
239112158Sdas				break;
240112158Sdas			} else {
241112158Sdas				vid.vid_id = vm->vm_vmid;
242112158Sdas			}
243112158Sdas		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
244112158Sdas			res = ENOENT;
245112158Sdas			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
246112158Sdas			close(imsg->fd);
247112158Sdas			break;
248219557Sdas		}
249219557Sdas		vmr.vmr_id = vid.vid_id;
250219557Sdas		log_debug("%s: sending fd to vmm", __func__);
251219557Sdas		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
252112158Sdas		    imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid));
253112158Sdas		break;
254112158Sdas	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
255112158Sdas		IMSG_SIZE_CHECK(imsg, &vid);
256219557Sdas		memcpy(&vid, imsg->data, sizeof(vid));
257112158Sdas		if (imsg->fd == -1) {
258112158Sdas			log_warnx("%s: invalid fd", __func__);
259112158Sdas			return (-1);
260112158Sdas		}
261112158Sdas		if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) !=
262219557Sdas		    sizeof(vmh)) {
263112158Sdas			log_warnx("%s: error reading vmh from received vm",
264112158Sdas			    __func__);
265112158Sdas			res = EIO;
266112158Sdas			close(imsg->fd);
267112158Sdas			cmd = IMSG_VMDOP_START_VM_RESPONSE;
268112158Sdas			break;
269112158Sdas		}
270112158Sdas
271112158Sdas		if (vmd_check_vmh(&vmh)) {
272112158Sdas			res = ENOENT;
273112158Sdas			close(imsg->fd);
274112158Sdas			cmd = IMSG_VMDOP_START_VM_RESPONSE;
275112158Sdas			break;
276112158Sdas		}
277112158Sdas		if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) !=
278112158Sdas		    sizeof(vmc)) {
279112158Sdas			log_warnx("%s: error reading vmc from received vm",
280112158Sdas			    __func__);
281112158Sdas			res = EIO;
282112158Sdas			close(imsg->fd);
283112158Sdas			cmd = IMSG_VMDOP_START_VM_RESPONSE;
284112158Sdas			break;
285112158Sdas		}
286112158Sdas		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
287112158Sdas		    sizeof(vmc.vmc_params.vcp_name));
288112158Sdas		vmc.vmc_params.vcp_id = 0;
289112158Sdas
290112158Sdas		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
291112158Sdas		if (ret != 0) {
292112158Sdas			res = errno;
293112158Sdas			cmd = IMSG_VMDOP_START_VM_RESPONSE;
294112158Sdas			close(imsg->fd);
295112158Sdas		} else {
296112158Sdas			vm->vm_state |= VM_STATE_RECEIVED;
297112158Sdas			config_setvm(ps, vm, imsg->hdr.peerid,
298112158Sdas			    vmc.vmc_owner.uid);
299112158Sdas			log_debug("%s: sending fd to vmm", __func__);
300112158Sdas			proc_compose_imsg(ps, PROC_VMM, -1,
301219557Sdas			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd,
302219557Sdas			    NULL, 0);
303112158Sdas		}
304112158Sdas		break;
305112158Sdas	case IMSG_VMDOP_DONE:
306112158Sdas		control_reset(&ps->ps_csock);
307112158Sdas		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
308112158Sdas			control_reset(rcs);
309112158Sdas		cmd = 0;
310112158Sdas		break;
311112158Sdas	default:
312112158Sdas		return (-1);
313112158Sdas	}
314112158Sdas
315112158Sdas	switch (cmd) {
316112158Sdas	case 0:
317112158Sdas		break;
318112158Sdas	case IMSG_VMDOP_START_VM_RESPONSE:
319112158Sdas	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
320112158Sdas		memset(&vmr, 0, sizeof(vmr));
321112158Sdas		vmr.vmr_result = res;
322112158Sdas		vmr.vmr_id = id;
323112158Sdas		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
324112158Sdas		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
325112158Sdas			return (-1);
326112158Sdas		break;
327112158Sdas	default:
328112158Sdas		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
329112158Sdas		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
330182709Sdas			return (-1);
331112158Sdas		break;
332112158Sdas	}
333112158Sdas
334112158Sdas	return (0);
335112158Sdas}
336112158Sdas
337112158Sdasint
338112158Sdasvmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
339219557Sdas{
340112158Sdas	struct vmop_result	 vmr;
341112158Sdas	struct privsep		*ps = p->p_ps;
342112158Sdas	int			 res = 0;
343112158Sdas	struct vmd_vm		*vm;
344112158Sdas	struct vm_create_params	*vcp;
345112158Sdas	struct vmop_info_result	 vir;
346112158Sdas
347112158Sdas	switch (imsg->hdr.type) {
348112158Sdas	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
349219557Sdas		IMSG_SIZE_CHECK(imsg, &vmr);
350112158Sdas		memcpy(&vmr, imsg->data, sizeof(vmr));
351112158Sdas		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
352112158Sdas			break;
353112158Sdas		proc_compose_imsg(ps, PROC_CONTROL, -1,
354112158Sdas		    imsg->hdr.type, imsg->hdr.peerid, -1,
355112158Sdas		    imsg->data, sizeof(imsg->data));
356112158Sdas		log_info("%s: paused vm %d successfully",
357219557Sdas		    vm->vm_params.vmc_params.vcp_name,
358112158Sdas		    vm->vm_vmid);
359112158Sdas		vm->vm_state |= VM_STATE_PAUSED;
360219557Sdas		break;
361112158Sdas	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
362112158Sdas		IMSG_SIZE_CHECK(imsg, &vmr);
363112158Sdas		memcpy(&vmr, imsg->data, sizeof(vmr));
364219557Sdas		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
365112158Sdas			break;
366112158Sdas		proc_compose_imsg(ps, PROC_CONTROL, -1,
367219557Sdas		    imsg->hdr.type, imsg->hdr.peerid, -1,
368112158Sdas		    imsg->data, sizeof(imsg->data));
369112158Sdas		log_info("%s: unpaused vm %d successfully.",
370112158Sdas		    vm->vm_params.vmc_params.vcp_name,
371112158Sdas		    vm->vm_vmid);
372219557Sdas		vm->vm_state &= ~VM_STATE_PAUSED;
373112158Sdas		break;
374112158Sdas	case IMSG_VMDOP_START_VM_RESPONSE:
375219557Sdas		IMSG_SIZE_CHECK(imsg, &vmr);
376219557Sdas		memcpy(&vmr, imsg->data, sizeof(vmr));
377112158Sdas		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
378112158Sdas			break;
379219557Sdas		vm->vm_pid = vmr.vmr_pid;
380219557Sdas		vcp = &vm->vm_params.vmc_params;
381112158Sdas		vcp->vcp_id = vmr.vmr_id;
382219557Sdas
383112158Sdas		/*
384112158Sdas		 * If the peerid is not -1, forward the response back to the
385112158Sdas		 * the control socket.  If it is -1, the request originated
386112158Sdas		 * from the parent, not the control socket.
387112158Sdas		 */
388112158Sdas		if (vm->vm_peerid != (uint32_t)-1) {
389112158Sdas			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
390112158Sdas			    sizeof(vmr.vmr_ttyname));
391219557Sdas			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
392112158Sdas			    imsg->hdr.type, vm->vm_peerid, -1,
393219557Sdas			    &vmr, sizeof(vmr)) == -1) {
394219557Sdas				errno = vmr.vmr_result;
395112158Sdas				log_warn("%s: failed to foward vm result",
396219557Sdas				    vcp->vcp_name);
397112158Sdas				vm_remove(vm, __func__);
398219557Sdas				return (-1);
399112158Sdas			}
400112158Sdas		}
401112158Sdas
402219557Sdas		if (vmr.vmr_result) {
403219557Sdas			errno = vmr.vmr_result;
404112158Sdas			log_warn("%s: failed to start vm", vcp->vcp_name);
405112158Sdas			vm_remove(vm, __func__);
406112158Sdas			break;
407112158Sdas		}
408112158Sdas
409219557Sdas		/* Now configure all the interfaces */
410219557Sdas		if (vm_priv_ifconfig(ps, vm) == -1) {
411219557Sdas			log_warn("%s: failed to configure vm", vcp->vcp_name);
412219557Sdas			vm_remove(vm, __func__);
413112158Sdas			break;
414112158Sdas		}
415112158Sdas
416219557Sdas		log_info("%s: started vm %d successfully, tty %s",
417112158Sdas		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
418219557Sdas		break;
419112158Sdas	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
420112158Sdas		IMSG_SIZE_CHECK(imsg, &vmr);
421112158Sdas		memcpy(&vmr, imsg->data, sizeof(vmr));
422112158Sdas
423112158Sdas		if (vmr.vmr_result) {
424112158Sdas			DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
425112158Sdas			    __func__, vmr.vmr_id);
426112158Sdas			proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
427112158Sdas		} else {
428112158Sdas			if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
429112158Sdas				break;
430112158Sdas			/* Mark VM as shutting down */
431219557Sdas			vm->vm_state |= VM_STATE_SHUTDOWN;
432112158Sdas		}
433112158Sdas		break;
434112158Sdas	case IMSG_VMDOP_SEND_VM_RESPONSE:
435112158Sdas		IMSG_SIZE_CHECK(imsg, &vmr);
436112158Sdas		memcpy(&vmr, imsg->data, sizeof(vmr));
437112158Sdas		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
438112158Sdas			break;
439112158Sdas		if (!vmr.vmr_result) {
440112158Sdas			log_info("%s: sent vm %d successfully.",
441112158Sdas			    vm->vm_params.vmc_params.vcp_name,
442112158Sdas			    vm->vm_vmid);
443219557Sdas			if (vm->vm_from_config)
444112158Sdas				vm_stop(vm, 0, __func__);
445112158Sdas			else
446112158Sdas				vm_remove(vm, __func__);
447219557Sdas		}
448219557Sdas
449219557Sdas		/* Send a response if a control client is waiting for it */
450112158Sdas		if (imsg->hdr.peerid != (uint32_t)-1) {
451112158Sdas			/* the error is meaningless for deferred responses */
452219557Sdas			vmr.vmr_result = 0;
453112158Sdas
454219557Sdas			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
455112158Sdas			    IMSG_VMDOP_SEND_VM_RESPONSE,
456112158Sdas			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
457112158Sdas				return (-1);
458219557Sdas		}
459112158Sdas		break;
460112158Sdas	case IMSG_VMDOP_TERMINATE_VM_EVENT:
461112158Sdas		IMSG_SIZE_CHECK(imsg, &vmr);
462112158Sdas		memcpy(&vmr, imsg->data, sizeof(vmr));
463112158Sdas		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
464112158Sdas		    __func__, vmr.vmr_id, vmr.vmr_result);
465112158Sdas		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
466112158Sdas			log_debug("%s: vm %d is no longer available",
467182709Sdas			    __func__, vmr.vmr_id);
468112158Sdas			break;
469112158Sdas		}
470112158Sdas		if (vmr.vmr_result != EAGAIN ||
471112158Sdas		    vm->vm_params.vmc_bootdevice) {
472219557Sdas			if (vm->vm_from_config)
473219557Sdas				vm_stop(vm, 0, __func__);
474219557Sdas			else
475219557Sdas				vm_remove(vm, __func__);
476219557Sdas		} else {
477219557Sdas			/* Stop VM instance but keep the tty open */
478219557Sdas			vm_stop(vm, 1, __func__);
479112158Sdas			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
480112158Sdas		}
481112158Sdas
482112158Sdas		/* The error is meaningless for deferred responses */
483112158Sdas		vmr.vmr_result = 0;
484112158Sdas
485112158Sdas		if (proc_compose_imsg(ps, PROC_CONTROL, -1,
486112158Sdas			IMSG_VMDOP_TERMINATE_VM_EVENT,
487112158Sdas			imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
488112158Sdas			return (-1);
489112158Sdas		break;
490112158Sdas	case IMSG_VMDOP_GET_INFO_VM_DATA:
491112158Sdas		IMSG_SIZE_CHECK(imsg, &vir);
492112158Sdas		memcpy(&vir, imsg->data, sizeof(vir));
493112158Sdas		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
494112158Sdas			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
495112158Sdas			if (vm->vm_ttyname != NULL)
496112158Sdas				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
497112158Sdas				    sizeof(vir.vir_ttyname));
498112158Sdas			log_debug("%s: running vm: %d, vm_state: 0x%x",
499112158Sdas			    __func__, vm->vm_vmid, vm->vm_state);
500112158Sdas			vir.vir_state = vm->vm_state;
501112158Sdas			/* get the user id who started the vm */
502112158Sdas			vir.vir_uid = vm->vm_uid;
503112158Sdas			vir.vir_gid = vm->vm_params.vmc_owner.gid;
504112158Sdas		}
505112158Sdas		if (proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type,
506112158Sdas		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
507112158Sdas			log_debug("%s: GET_INFO_VM failed for vm %d, removing",
508112158Sdas			    __func__, vm->vm_vmid);
509112158Sdas			vm_remove(vm, __func__);
510112158Sdas			return (-1);
511112158Sdas		}
512112158Sdas		break;
513112158Sdas	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
514112158Sdas		/*
515112158Sdas		 * PROC_VMM has responded with the *running* VMs, now we
516112158Sdas		 * append the others. These use the special value 0 for their
517112158Sdas		 * kernel id to indicate that they are not running.
518112158Sdas		 */
519112158Sdas		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
520112158Sdas			if (!(vm->vm_state & VM_STATE_RUNNING)) {
521112158Sdas				memset(&vir, 0, sizeof(vir));
522112158Sdas				vir.vir_info.vir_id = vm->vm_vmid;
523112158Sdas				strlcpy(vir.vir_info.vir_name,
524112158Sdas				    vm->vm_params.vmc_params.vcp_name,
525112158Sdas				    VMM_MAX_NAME_LEN);
526112158Sdas				vir.vir_info.vir_memory_size =
527112158Sdas				    vm->vm_params.vmc_params.
528112158Sdas				    vcp_memranges[0].vmr_size;
529112158Sdas				vir.vir_info.vir_ncpus =
530112158Sdas				    vm->vm_params.vmc_params.vcp_ncpus;
531112158Sdas				/* get the configured user id for this vm */
532112158Sdas				vir.vir_uid = vm->vm_params.vmc_owner.uid;
533112158Sdas				vir.vir_gid = vm->vm_params.vmc_owner.gid;
534112158Sdas				log_debug("%s: vm: %d, vm_state: 0x%x",
535112158Sdas				    __func__, vm->vm_vmid, vm->vm_state);
536112158Sdas				vir.vir_state = vm->vm_state;
537112158Sdas				if (proc_compose_imsg(ps, PROC_CONTROL, -1,
538112158Sdas				    IMSG_VMDOP_GET_INFO_VM_DATA,
539112158Sdas				    imsg->hdr.peerid, -1, &vir,
540182709Sdas				    sizeof(vir)) == -1) {
541112158Sdas					log_debug("%s: GET_INFO_VM_END failed",
542112158Sdas					    __func__);
543219557Sdas					vm_remove(vm, __func__);
544112158Sdas					return (-1);
545219557Sdas				}
546112158Sdas			}
547112158Sdas		}
548112158Sdas		IMSG_SIZE_CHECK(imsg, &res);
549112158Sdas		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
550112158Sdas		break;
551112158Sdas	default:
552112158Sdas		return (-1);
553112158Sdas	}
554112158Sdas
555112158Sdas	return (0);
556112158Sdas}
557112158Sdas
558112158Sdasint
559112158Sdasvmd_dispatch_priv(int fd, struct privsep_proc *p, struct imsg *imsg)
560112158Sdas{
561112158Sdas	struct vmop_addr_result	 var;
562112158Sdas
563112158Sdas	switch (imsg->hdr.type) {
564112158Sdas	case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
565112158Sdas		IMSG_SIZE_CHECK(imsg, &var);
566112158Sdas		memcpy(&var, imsg->data, sizeof(var));
567112158Sdas		proc_forward_imsg(p->p_ps, imsg, PROC_VMM, -1);
568112158Sdas		break;
569112158Sdas	default:
570112158Sdas		return (-1);
571112158Sdas	}
572112158Sdas
573112158Sdas	return (0);
574112158Sdas}
575112158Sdas
576112158Sdasint
577112158Sdasvmd_check_vmh(struct vm_dump_header *vmh)
578112158Sdas{
579112158Sdas	int i;
580112158Sdas	unsigned int code, leaf;
581112158Sdas	unsigned int a, b, c, d;
582112158Sdas
583112158Sdas	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) {
584112158Sdas		log_warnx("%s: incompatible dump signature", __func__);
585112158Sdas		return (-1);
586112158Sdas	}
587112158Sdas
588112158Sdas	if (vmh->vmh_version != VM_DUMP_VERSION) {
589112158Sdas		log_warnx("%s: incompatible dump version", __func__);
590112158Sdas		return (-1);
591112158Sdas	}
592112158Sdas
593112158Sdas	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
594112158Sdas		code = vmh->vmh_cpuids[i].code;
595112158Sdas		leaf = vmh->vmh_cpuids[i].leaf;
596112158Sdas		if (leaf != 0x00) {
597112158Sdas			log_debug("%s: invalid leaf 0x%x for code 0x%x",
598112158Sdas			    __func__, leaf, code);
599112158Sdas			return (-1);
600112158Sdas		}
601112158Sdas
602112158Sdas		switch (code) {
603112158Sdas		case 0x00:
604112158Sdas			CPUID_LEAF(code, leaf, a, b, c, d);
605112158Sdas			if (vmh->vmh_cpuids[i].a > a) {
606112158Sdas				log_debug("%s: incompatible cpuid level",
607112158Sdas				    __func__);
608112158Sdas				return (-1);
609112158Sdas			}
610112158Sdas			if (!(vmh->vmh_cpuids[i].b == b &&
611112158Sdas			    vmh->vmh_cpuids[i].c == c &&
612112158Sdas			    vmh->vmh_cpuids[i].d == d)) {
613112158Sdas				log_debug("%s: incompatible cpu brand",
614112158Sdas				    __func__);
615112158Sdas				return (-1);
616112158Sdas			}
617112158Sdas			break;
618112158Sdas
619112158Sdas		case 0x01:
620112158Sdas			CPUID_LEAF(code, leaf, a, b, c, d);
621112158Sdas			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
622112158Sdas			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
623112158Sdas				log_debug("%s: incompatible cpu features "
624112158Sdas				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
625112158Sdas				    code, leaf);
626112158Sdas				return (-1);
627112158Sdas			}
628112158Sdas			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
629112158Sdas			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
630112158Sdas				log_debug("%s: incompatible cpu features "
631219557Sdas				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
632112158Sdas				    code, leaf);
633182709Sdas				return (-1);
634112158Sdas			}
635112158Sdas			break;
636112158Sdas
637112158Sdas		case 0x07:
638112158Sdas			CPUID_LEAF(code, leaf, a, b, c, d);
639112158Sdas			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
640112158Sdas			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
641112158Sdas				log_debug("%s: incompatible cpu features "
642112158Sdas				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
643112158Sdas				    code, leaf);
644112158Sdas				return (-1);
645112158Sdas			}
646112158Sdas			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
647112158Sdas			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
648219557Sdas				log_debug("%s: incompatible cpu features "
649112158Sdas				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
650219557Sdas				    code, leaf);
651112158Sdas				return (-1);
652219557Sdas			}
653112158Sdas			break;
654112158Sdas
655112158Sdas		case 0x0d:
656112158Sdas			CPUID_LEAF(code, leaf, a, b, c, d);
657112158Sdas			if (vmh->vmh_cpuids[i].b > b) {
658112158Sdas				log_debug("%s: incompatible cpu: insufficient "
659112158Sdas				    "max save area for enabled XCR0 features",
660112158Sdas				    __func__);
661182709Sdas				return (-1);
662112158Sdas			}
663112158Sdas			if (vmh->vmh_cpuids[i].c > c) {
664112158Sdas				log_debug("%s: incompatible cpu: insufficient "
665112158Sdas				    "max save area for supported XCR0 features",
666112158Sdas				    __func__);
667112158Sdas				return (-1);
668112158Sdas			}
669219557Sdas			break;
670219557Sdas
671219557Sdas		case 0x80000001:
672219557Sdas			CPUID_LEAF(code, leaf, a, b, c, d);
673219557Sdas			if ((vmh->vmh_cpuids[i].a & a) !=
674112158Sdas			    vmh->vmh_cpuids[i].a) {
675112158Sdas				log_debug("%s: incompatible cpu features "
676112158Sdas				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
677112158Sdas				    code, leaf);
678112158Sdas				return (-1);
679112158Sdas			}
680112158Sdas			if ((vmh->vmh_cpuids[i].c & c) !=
681112158Sdas			    vmh->vmh_cpuids[i].c) {
682112158Sdas				log_debug("%s: incompatible cpu features "
683182709Sdas				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
684112158Sdas				    code, leaf);
685112158Sdas				return (-1);
686112158Sdas			}
687112158Sdas			if ((vmh->vmh_cpuids[i].d & d) !=
688112158Sdas			    vmh->vmh_cpuids[i].d) {
689112158Sdas				log_debug("%s: incompatible cpu features "
690112158Sdas				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
691112158Sdas				    code, leaf);
692112158Sdas				return (-1);
693112158Sdas			}
694112158Sdas			break;
695112158Sdas
696112158Sdas		default:
697112158Sdas			log_debug("%s: unknown code 0x%x", __func__, code);
698112158Sdas			return (-1);
699112158Sdas		}
700112158Sdas	}
701112158Sdas
702112158Sdas	return (0);
703112158Sdas}
704112158Sdas
705112158Sdasvoid
706112158Sdasvmd_sighdlr(int sig, short event, void *arg)
707112158Sdas{
708112158Sdas	if (privsep_process != PROC_PARENT)
709112158Sdas		return;
710112158Sdas	log_debug("%s: handling signal", __func__);
711112158Sdas
712112158Sdas	switch (sig) {
713112158Sdas	case SIGHUP:
714112158Sdas		log_info("%s: reload requested with SIGHUP", __func__);
715112158Sdas
716112158Sdas		/*
717112158Sdas		 * This is safe because libevent uses async signal handlers
718112158Sdas		 * that run in the event loop and not in signal context.
719112158Sdas		 */
720112158Sdas		(void)vmd_reload(0, NULL);
721112158Sdas		break;
722112158Sdas	case SIGPIPE:
723112158Sdas		log_info("%s: ignoring SIGPIPE", __func__);
724112158Sdas		break;
725112158Sdas	case SIGUSR1:
726182709Sdas		log_info("%s: ignoring SIGUSR1", __func__);
727112158Sdas		break;
728112158Sdas	case SIGTERM:
729112158Sdas	case SIGINT:
730112158Sdas		vmd_shutdown();
731112158Sdas		break;
732112158Sdas	default:
733219557Sdas		fatalx("unexpected signal");
734219557Sdas	}
735219557Sdas}
736219557Sdas
737219557Sdas__dead void
738219557Sdasusage(void)
739112158Sdas{
740112158Sdas	extern char *__progname;
741112158Sdas	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
742112158Sdas	    __progname);
743112158Sdas	exit(1);
744112158Sdas}
745112158Sdas
746112158Sdasint
747112158Sdasmain(int argc, char **argv)
748112158Sdas{
749219557Sdas	struct privsep		*ps;
750112158Sdas	int			 ch;
751219557Sdas	const char		*conffile = VMD_CONF;
752112158Sdas	enum privsep_procid	 proc_id = PROC_PARENT;
753112158Sdas	int			 proc_instance = 0;
754112158Sdas	const char		*errp, *title = NULL;
755112158Sdas	int			 argc0 = argc;
756112158Sdas
757112158Sdas	log_init(0, LOG_DAEMON);
758112158Sdas
759112158Sdas	if ((env = calloc(1, sizeof(*env))) == NULL)
760112158Sdas		fatal("calloc: env");
761112158Sdas
762112158Sdas	while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) {
763112158Sdas		switch (ch) {
764112158Sdas		case 'D':
765112158Sdas			if (cmdline_symset(optarg) < 0)
766219557Sdas				log_warnx("could not parse macro definition %s",
767219557Sdas				    optarg);
768219557Sdas			break;
769112158Sdas		case 'd':
770112158Sdas			env->vmd_debug = 2;
771112158Sdas			break;
772112158Sdas		case 'f':
773112158Sdas			conffile = optarg;
774112158Sdas			break;
775112158Sdas		case 'v':
776112158Sdas			env->vmd_verbose++;
777112158Sdas			break;
778112158Sdas		case 'n':
779112158Sdas			env->vmd_noaction = 1;
780112158Sdas			break;
781		case 'P':
782			title = optarg;
783			proc_id = proc_getid(procs, nitems(procs), title);
784			if (proc_id == PROC_MAX)
785				fatalx("invalid process name");
786			break;
787		case 'I':
788			proc_instance = strtonum(optarg, 0,
789			    PROC_MAX_INSTANCES, &errp);
790			if (errp)
791				fatalx("invalid process instance");
792			break;
793		default:
794			usage();
795		}
796	}
797
798	argc -= optind;
799	if (argc > 0)
800		usage();
801
802	if (env->vmd_noaction && !env->vmd_debug)
803		env->vmd_debug = 1;
804
805	log_init(env->vmd_debug, LOG_DAEMON);
806	log_setverbose(env->vmd_verbose);
807
808	/* check for root privileges */
809	if (env->vmd_noaction == 0) {
810		if (geteuid())
811			fatalx("need root privileges");
812	}
813
814	ps = &env->vmd_ps;
815	ps->ps_env = env;
816	env->vmd_fd = -1;
817
818	if (config_init(env) == -1)
819		fatal("failed to initialize configuration");
820
821	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
822		fatal("unknown user %s", VMD_USER);
823
824	/* First proc runs as root without pledge but in default chroot */
825	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
826	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */
827
828	/* Open /dev/vmm */
829	if (env->vmd_noaction == 0) {
830		env->vmd_fd = open(VMM_NODE, O_RDWR);
831		if (env->vmd_fd == -1)
832			fatal("%s", VMM_NODE);
833	}
834
835	/* Configure the control socket */
836	ps->ps_csock.cs_name = SOCKET_NAME;
837	TAILQ_INIT(&ps->ps_rcsocks);
838
839	/* Configuration will be parsed after forking the children */
840	env->vmd_conffile = conffile;
841
842	if (env->vmd_noaction)
843		ps->ps_noaction = 1;
844	ps->ps_instance = proc_instance;
845	if (title != NULL)
846		ps->ps_title[proc_id] = title;
847
848	/* only the parent returns */
849	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
850	    proc_id);
851
852	log_procinit("parent");
853	if (!env->vmd_debug && daemon(0, 0) == -1)
854		fatal("can't daemonize");
855
856	if (ps->ps_noaction == 0)
857		log_info("startup");
858
859	event_init();
860
861	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
862	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
863	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
864	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
865	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);
866
867	signal_add(&ps->ps_evsigint, NULL);
868	signal_add(&ps->ps_evsigterm, NULL);
869	signal_add(&ps->ps_evsighup, NULL);
870	signal_add(&ps->ps_evsigpipe, NULL);
871	signal_add(&ps->ps_evsigusr1, NULL);
872
873	if (!env->vmd_noaction)
874		proc_connect(ps);
875
876	if (vmd_configure() == -1)
877		fatalx("configuration failed");
878
879	event_dispatch();
880
881	log_debug("parent exiting");
882
883	return (0);
884}
885
886void
887start_vm_batch(int fd, short type, void *args)
888{
889	int		i = 0;
890	struct vmd_vm	*vm;
891
892	log_debug("%s: starting batch of %d vms", __func__,
893	    env->vmd_cfg.parallelism);
894	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
895		if (!(vm->vm_state & VM_STATE_WAITING)) {
896			log_debug("%s: not starting vm %s (disabled)",
897			    __func__,
898			    vm->vm_params.vmc_params.vcp_name);
899			continue;
900		}
901		i++;
902		if (i > env->vmd_cfg.parallelism) {
903			evtimer_add(&staggered_start_timer,
904			    &env->vmd_cfg.delay);
905			break;
906		}
907		vm->vm_state &= ~VM_STATE_WAITING;
908		config_setvm(&env->vmd_ps, vm, -1, vm->vm_params.vmc_owner.uid);
909	}
910	log_debug("%s: done starting vms", __func__);
911}
912
913int
914vmd_configure(void)
915{
916	int			ncpus;
917	struct vmd_switch	*vsw;
918	int ncpu_mib[] = {CTL_HW, HW_NCPUONLINE};
919	size_t ncpus_sz = sizeof(ncpus);
920
921	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
922		fatal("open %s", PATH_PTMDEV);
923
924	/*
925	 * pledge in the parent process:
926	 * stdio - for malloc and basic I/O including events.
927	 * rpath - for reload to open and read the configuration files.
928	 * wpath - for opening disk images and tap devices.
929	 * tty - for openpty and TIOCUCNTL.
930	 * proc - run kill to terminate its children safely.
931	 * sendfd - for disks, interfaces and other fds.
932	 * recvfd - for send and receive.
933	 * getpw - lookup user or group id by name.
934	 * chown, fattr - change tty ownership
935	 * flock - locking disk files
936	 */
937	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
938	    " chown fattr flock", NULL) == -1)
939		fatal("pledge");
940
941	if (parse_config(env->vmd_conffile) == -1) {
942		proc_kill(&env->vmd_ps);
943		exit(1);
944	}
945
946	if (env->vmd_noaction) {
947		fprintf(stderr, "configuration OK\n");
948		proc_kill(&env->vmd_ps);
949		exit(0);
950	}
951
952	/* Send shared global configuration to all children */
953	if (config_setconfig(env) == -1)
954		return (-1);
955
956	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
957		if (vsw->sw_running)
958			continue;
959		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
960			log_warn("%s: failed to create switch %s",
961			    __func__, vsw->sw_name);
962			switch_remove(vsw);
963			return (-1);
964		}
965	}
966
967	if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) {
968		env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY;
969		if (sysctl(ncpu_mib, nitems(ncpu_mib), &ncpus, &ncpus_sz, NULL, 0) == -1)
970			ncpus = 1;
971		env->vmd_cfg.parallelism = ncpus;
972		log_debug("%s: setting staggered start configuration to "
973		    "parallelism: %d and delay: %lld",
974		    __func__, ncpus, (long long) env->vmd_cfg.delay.tv_sec);
975	}
976
977	log_debug("%s: starting vms in staggered fashion", __func__);
978	evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
979	/* start first batch */
980	start_vm_batch(0, 0, NULL);
981
982	return (0);
983}
984
985int
986vmd_reload(unsigned int reset, const char *filename)
987{
988	struct vmd_vm		*vm, *next_vm;
989	struct vmd_switch	*vsw;
990	int			 reload = 0;
991
992	/* Switch back to the default config file */
993	if (filename == NULL || *filename == '\0') {
994		filename = env->vmd_conffile;
995		reload = 1;
996	}
997
998	log_debug("%s: level %d config file %s", __func__, reset, filename);
999
1000	if (reset) {
1001		/* Purge the configuration */
1002		config_purge(env, reset);
1003		config_setreset(env, reset);
1004	} else {
1005		/*
1006		 * Load or reload the configuration.
1007		 *
1008		 * Reloading removes all non-running VMs before processing the
1009		 * config file, whereas loading only adds to the existing list
1010		 * of VMs.
1011		 */
1012
1013		if (reload) {
1014			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
1015			    next_vm) {
1016				if (!(vm->vm_state & VM_STATE_RUNNING)) {
1017					DPRINTF("%s: calling vm_remove",
1018					    __func__);
1019					vm_remove(vm, __func__);
1020				}
1021			}
1022		}
1023
1024		if (parse_config(filename) == -1) {
1025			log_debug("%s: failed to load config file %s",
1026			    __func__, filename);
1027			return (-1);
1028		}
1029
1030		if (reload) {
1031			/* Update shared global configuration in all children */
1032			if (config_setconfig(env) == -1)
1033				return (-1);
1034		}
1035
1036		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1037			if (vsw->sw_running)
1038				continue;
1039			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
1040				log_warn("%s: failed to create switch %s",
1041				    __func__, vsw->sw_name);
1042				switch_remove(vsw);
1043				return (-1);
1044			}
1045		}
1046
1047		log_debug("%s: starting vms in staggered fashion", __func__);
1048		evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
1049		/* start first batch */
1050		start_vm_batch(0, 0, NULL);
1051
1052		}
1053
1054	return (0);
1055}
1056
1057void
1058vmd_shutdown(void)
1059{
1060	struct vmd_vm *vm, *vm_next;
1061
1062	log_debug("%s: performing shutdown", __func__);
1063
1064	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
1065		vm_remove(vm, __func__);
1066	}
1067
1068	proc_kill(&env->vmd_ps);
1069	free(env);
1070
1071	log_warnx("parent terminating");
1072	exit(0);
1073}
1074
1075struct vmd_vm *
1076vm_getbyvmid(uint32_t vmid)
1077{
1078	struct vmd_vm	*vm;
1079
1080	if (vmid == 0)
1081		return (NULL);
1082	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1083		if (vm->vm_vmid == vmid)
1084			return (vm);
1085	}
1086
1087	return (NULL);
1088}
1089
1090struct vmd_vm *
1091vm_getbyid(uint32_t id)
1092{
1093	struct vmd_vm	*vm;
1094
1095	if (id == 0)
1096		return (NULL);
1097	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1098		if (vm->vm_params.vmc_params.vcp_id == id)
1099			return (vm);
1100	}
1101
1102	return (NULL);
1103}
1104
1105uint32_t
1106vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1107{
1108	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1109		return (0);
1110	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1111	    id, vm->vm_vmid);
1112	return (vm->vm_vmid);
1113}
1114
1115uint32_t
1116vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1117{
1118	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1119		return (0);
1120	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1121	    vmid, vm->vm_params.vmc_params.vcp_id);
1122	return (vm->vm_params.vmc_params.vcp_id);
1123}
1124
1125struct vmd_vm *
1126vm_getbyname(const char *name)
1127{
1128	struct vmd_vm	*vm;
1129
1130	if (name == NULL)
1131		return (NULL);
1132	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1133		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1134			return (vm);
1135	}
1136
1137	return (NULL);
1138}
1139
1140struct vmd_vm *
1141vm_getbypid(pid_t pid)
1142{
1143	struct vmd_vm	*vm;
1144
1145	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1146		if (vm->vm_pid == pid)
1147			return (vm);
1148	}
1149
1150	return (NULL);
1151}
1152
1153void
1154vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
1155{
1156	struct privsep	*ps = &env->vmd_ps;
1157	unsigned int	 i, j;
1158
1159	if (vm == NULL)
1160		return;
1161
1162	log_debug("%s: %s %s stopping vm %d%s",
1163	    __func__, ps->ps_title[privsep_process], caller,
1164	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");
1165
1166	vm->vm_state &= ~(VM_STATE_RUNNING | VM_STATE_SHUTDOWN);
1167
1168	user_inc(&vm->vm_params.vmc_params, vm->vm_user, 0);
1169	user_put(vm->vm_user);
1170
1171	if (vm->vm_iev.ibuf.fd != -1) {
1172		event_del(&vm->vm_iev.ev);
1173		close(vm->vm_iev.ibuf.fd);
1174	}
1175	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
1176		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
1177			if (vm->vm_disks[i][j] != -1) {
1178				close(vm->vm_disks[i][j]);
1179				vm->vm_disks[i][j] = -1;
1180			}
1181		}
1182	}
1183	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
1184		if (vm->vm_ifs[i].vif_fd != -1) {
1185			close(vm->vm_ifs[i].vif_fd);
1186			vm->vm_ifs[i].vif_fd = -1;
1187		}
1188		free(vm->vm_ifs[i].vif_name);
1189		free(vm->vm_ifs[i].vif_switch);
1190		free(vm->vm_ifs[i].vif_group);
1191		vm->vm_ifs[i].vif_name = NULL;
1192		vm->vm_ifs[i].vif_switch = NULL;
1193		vm->vm_ifs[i].vif_group = NULL;
1194	}
1195	if (vm->vm_kernel != -1) {
1196		close(vm->vm_kernel);
1197		vm->vm_kernel = -1;
1198	}
1199	if (vm->vm_cdrom != -1) {
1200		close(vm->vm_cdrom);
1201		vm->vm_cdrom = -1;
1202	}
1203	if (!keeptty) {
1204		vm_closetty(vm);
1205		vm->vm_uid = 0;
1206	}
1207}
1208
1209void
1210vm_remove(struct vmd_vm *vm, const char *caller)
1211{
1212	struct privsep	*ps = &env->vmd_ps;
1213
1214	if (vm == NULL)
1215		return;
1216
1217	log_debug("%s: %s %s removing vm %d from running config",
1218	    __func__, ps->ps_title[privsep_process], caller,
1219	    vm->vm_vmid);
1220
1221	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);
1222
1223	user_put(vm->vm_user);
1224	vm_stop(vm, 0, caller);
1225	free(vm);
1226}
1227
1228int
1229vm_claimid(const char *name, int uid, uint32_t *id)
1230{
1231	struct name2id *n2i = NULL;
1232
1233	TAILQ_FOREACH(n2i, env->vmd_known, entry)
1234		if (strcmp(n2i->name, name) == 0 && n2i->uid == uid)
1235			goto out;
1236
1237	if (++env->vmd_nvm == 0) {
1238		log_warnx("too many vms");
1239		return -1;
1240	}
1241	if ((n2i = calloc(1, sizeof(struct name2id))) == NULL) {
1242		log_warnx("could not alloc vm name");
1243		return -1;
1244	}
1245	n2i->id = env->vmd_nvm;
1246	n2i->uid = uid;
1247	if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name)) {
1248		log_warnx("vm name too long");
1249		free(n2i);
1250		return -1;
1251	}
1252	TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry);
1253
1254out:
1255	*id = n2i->id;
1256	return 0;
1257}
1258
1259int
1260vm_register(struct privsep *ps, struct vmop_create_params *vmc,
1261    struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
1262{
1263	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
1264	struct vm_create_params	*vcp = &vmc->vmc_params;
1265	struct vmop_owner	*vmo = NULL;
1266	struct vmd_user		*usr = NULL;
1267	uint32_t		 nid, rng;
1268	unsigned int		 i, j;
1269	struct vmd_switch	*sw;
1270	char			*s;
1271	int			 ret = 0;
1272
1273	/* Check if this is an instance of another VM */
1274	if ((ret = vm_instance(ps, &vm_parent, vmc, uid)) != 0) {
1275		errno = ret; /* XXX might set invalid errno */
1276		return (-1);
1277	}
1278
1279	errno = 0;
1280	*ret_vm = NULL;
1281
1282	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1283	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1284		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
1285		    uid) != 0) {
1286			errno = EPERM;
1287			goto fail;
1288		}
1289		*ret_vm = vm;
1290		errno = EALREADY;
1291		goto fail;
1292	}
1293
1294	if (vm_parent != NULL)
1295		vmo = &vm_parent->vm_params.vmc_insowner;
1296
1297	/* non-root users can only start existing VMs or instances */
1298	if (vm_checkperm(NULL, vmo, uid) != 0) {
1299		log_warnx("permission denied");
1300		errno = EPERM;
1301		goto fail;
1302	}
1303	if (vmc->vmc_flags == 0) {
1304		log_warnx("invalid configuration, no devices");
1305		errno = VMD_DISK_MISSING;
1306		goto fail;
1307	}
1308	if (vcp->vcp_ncpus == 0)
1309		vcp->vcp_ncpus = 1;
1310	if (vcp->vcp_memranges[0].vmr_size == 0)
1311		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
1312	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
1313		log_warnx("invalid number of CPUs");
1314		goto fail;
1315	} else if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) {
1316		log_warnx("invalid number of disks");
1317		goto fail;
1318	} else if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) {
1319		log_warnx("invalid number of interfaces");
1320		goto fail;
1321	} else if (strlen(vcp->vcp_kernel) == 0 &&
1322	    vcp->vcp_ndisks == 0 && strlen(vcp->vcp_cdrom) == 0) {
1323		log_warnx("no kernel or disk/cdrom specified");
1324		goto fail;
1325	} else if (strlen(vcp->vcp_name) == 0) {
1326		log_warnx("invalid VM name");
1327		goto fail;
1328	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
1329	    *vcp->vcp_name == '_') {
1330		log_warnx("invalid VM name");
1331		goto fail;
1332	} else {
1333		for (s = vcp->vcp_name; *s != '\0'; ++s) {
1334			if (!(isalnum(*s) || *s == '.' || *s == '-' ||
1335			    *s == '_')) {
1336				log_warnx("invalid VM name");
1337				goto fail;
1338			}
1339		}
1340	}
1341
1342	/* track active users */
1343	if (uid != 0 && env->vmd_users != NULL &&
1344	    (usr = user_get(uid)) == NULL) {
1345		log_warnx("could not add user");
1346		goto fail;
1347	}
1348
1349	if ((vm = calloc(1, sizeof(*vm))) == NULL)
1350		goto fail;
1351
1352	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
1353	vmc = &vm->vm_params;
1354	vcp = &vmc->vmc_params;
1355	vm->vm_pid = -1;
1356	vm->vm_tty = -1;
1357	vm->vm_receive_fd = -1;
1358	vm->vm_state &= ~VM_STATE_PAUSED;
1359	vm->vm_user = usr;
1360
1361	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
1362		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
1363			vm->vm_disks[i][j] = -1;
1364	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
1365		vm->vm_ifs[i].vif_fd = -1;
1366	for (i = 0; i < vcp->vcp_nnics; i++) {
1367		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
1368			/* inherit per-interface flags from the switch */
1369			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
1370		}
1371
1372		/*
1373		 * If the MAC address is zero, always randomize it in vmd(8)
1374		 * because we cannot rely on the guest OS to do the right
1375		 * thing like OpenBSD does.  Based on ether_fakeaddr()
1376		 * from the kernel, incremented by one to differentiate
1377		 * the source.
1378		 */
1379		if (memcmp(zero_mac, &vcp->vcp_macs[i], ETHER_ADDR_LEN) == 0) {
1380			rng = arc4random();
1381			vcp->vcp_macs[i][0] = 0xfe;
1382			vcp->vcp_macs[i][1] = 0xe1;
1383			vcp->vcp_macs[i][2] = 0xba + 1;
1384			vcp->vcp_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
1385			vcp->vcp_macs[i][4] = rng;
1386			vcp->vcp_macs[i][5] = rng >> 8;
1387		}
1388	}
1389	vm->vm_kernel = -1;
1390	vm->vm_cdrom = -1;
1391	vm->vm_iev.ibuf.fd = -1;
1392
1393	/*
1394	 * Assign a new internal Id if not specified and we succeed in
1395	 * claiming a new Id.
1396	 */
1397	if (id != 0)
1398		vm->vm_vmid = id;
1399	else if (vm_claimid(vcp->vcp_name, uid, &nid) == -1)
1400		goto fail;
1401	else
1402		vm->vm_vmid = nid;
1403
1404	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
1405	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);
1406
1407	*ret_vm = vm;
1408	return (0);
1409 fail:
1410	if (errno == 0)
1411		errno = EINVAL;
1412	return (-1);
1413}
1414
1415int
1416vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1417    struct vmop_create_params *vmc, uid_t uid)
1418{
1419	char			*name;
1420	struct vm_create_params	*vcp = &vmc->vmc_params;
1421	struct vmop_create_params *vmcp;
1422	struct vm_create_params	*vcpp;
1423	struct vmd_vm		*vm = NULL;
1424	unsigned int		 i, j;
1425	uint32_t		 id;
1426
1427	/* return without error if the parent is NULL (nothing to inherit) */
1428	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1429	    vmc->vmc_instance[0] == '\0')
1430		return (0);
1431
1432	if ((*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) {
1433		return (VMD_PARENT_INVALID);
1434	}
1435
1436	vmcp = &(*vm_parent)->vm_params;
1437	vcpp = &vmcp->vmc_params;
1438
1439	/* Are we allowed to create an instance from this VM? */
1440	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1441		log_warnx("vm \"%s\" no permission to create vm instance",
1442		    vcpp->vcp_name);
1443		return (ENAMETOOLONG);
1444	}
1445
1446	id = vcp->vcp_id;
1447	name = vcp->vcp_name;
1448
1449	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1450	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1451		return (EPROCLIM);
1452	}
1453
1454	/* CPU */
1455	if (vcp->vcp_ncpus == 0)
1456		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1457	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1458	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1459		log_warnx("vm \"%s\" no permission to set cpus", name);
1460		return (EPERM);
1461	}
1462
1463	/* memory */
1464	if (vcp->vcp_memranges[0].vmr_size == 0)
1465		vcp->vcp_memranges[0].vmr_size =
1466		    vcpp->vcp_memranges[0].vmr_size;
1467	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1468	    vcp->vcp_memranges[0].vmr_size !=
1469	    vcpp->vcp_memranges[0].vmr_size) {
1470		log_warnx("vm \"%s\" no permission to set memory", name);
1471		return (EPERM);
1472	}
1473
1474	/* disks cannot be inherited */
1475	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1476	    vcp->vcp_ndisks) {
1477		log_warnx("vm \"%s\" no permission to set disks", name);
1478		return (EPERM);
1479	}
1480	for (i = 0; i < vcp->vcp_ndisks; i++) {
1481		/* Check if this disk is already used in the parent */
1482		for (j = 0; j < vcpp->vcp_ndisks; j++) {
1483			if (strcmp(vcp->vcp_disks[i],
1484			    vcpp->vcp_disks[j]) == 0) {
1485				log_warnx("vm \"%s\" disk %s cannot be reused",
1486				    name, vcp->vcp_disks[i]);
1487				return (EBUSY);
1488			}
1489		}
1490		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1491	}
1492
1493	/* interfaces */
1494	if (vcp->vcp_nnics > 0 &&
1495	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1496	    vcp->vcp_nnics != vcpp->vcp_nnics) {
1497		log_warnx("vm \"%s\" no permission to set interfaces", name);
1498		return (EPERM);
1499	}
1500	for (i = 0; i < vcpp->vcp_nnics; i++) {
1501		/* Interface got overwritten */
1502		if (i < vcp->vcp_nnics)
1503			continue;
1504
1505		/* Copy interface from parent */
1506		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1507		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1508		    sizeof(vmc->vmc_ifnames[i]));
1509		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1510		    sizeof(vmc->vmc_ifswitch[i]));
1511		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1512		    sizeof(vmc->vmc_ifgroup[i]));
1513		memcpy(vcp->vcp_macs[i], vcpp->vcp_macs[i],
1514		    sizeof(vcp->vcp_macs[i]));
1515		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1516		vcp->vcp_nnics++;
1517	}
1518	for (i = 0; i < vcp->vcp_nnics; i++) {
1519		for (j = 0; j < vcpp->vcp_nnics; j++) {
1520			if (memcmp(zero_mac, vcp->vcp_macs[i],
1521			    sizeof(vcp->vcp_macs[i])) != 0 &&
1522			    memcmp(vcpp->vcp_macs[i], vcp->vcp_macs[i],
1523			    sizeof(vcp->vcp_macs[i])) != 0) {
1524				log_warnx("vm \"%s\" lladdr cannot be reused",
1525				    name);
1526				return (EBUSY);
1527			}
1528			if (strlen(vmc->vmc_ifnames[i]) &&
1529			    strcmp(vmc->vmc_ifnames[i],
1530			    vmcp->vmc_ifnames[j]) == 0) {
1531				log_warnx("vm \"%s\" %s cannot be reused",
1532				    vmc->vmc_ifnames[i], name);
1533				return (EBUSY);
1534			}
1535		}
1536	}
1537
1538	/* kernel */
1539	if (strlen(vcp->vcp_kernel) > 0) {
1540		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1541			log_warnx("vm \"%s\" no permission to set boot image",
1542			    name);
1543			return (EPERM);
1544		}
1545		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1546	} else if (strlcpy(vcp->vcp_kernel, vcpp->vcp_kernel,
1547	    sizeof(vcp->vcp_kernel)) >= sizeof(vcp->vcp_kernel)) {
1548		log_warnx("vm \"%s\" kernel name too long", name);
1549		return (EINVAL);
1550	}
1551
1552	/* cdrom */
1553	if (strlen(vcp->vcp_cdrom) > 0) {
1554		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1555			log_warnx("vm \"%s\" no permission to set cdrom", name);
1556			return (EPERM);
1557		}
1558		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1559	} else if (strlcpy(vcp->vcp_cdrom, vcpp->vcp_cdrom,
1560	    sizeof(vcp->vcp_cdrom)) >= sizeof(vcp->vcp_cdrom)) {
1561		log_warnx("vm \"%s\" cdrom name too long", name);
1562		return (EINVAL);
1563	}
1564
1565	/* user */
1566	if (vmc->vmc_owner.uid == 0)
1567		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1568	else if (vmc->vmc_owner.uid != uid &&
1569	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1570		log_warnx("vm \"%s\" user mismatch", name);
1571		return (EPERM);
1572	}
1573
1574	/* group */
1575	if (vmc->vmc_owner.gid == 0)
1576		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1577	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1578		log_warnx("vm \"%s\" group mismatch", name);
1579		return (EPERM);
1580	}
1581
1582	/* child instances */
1583	if (vmc->vmc_insflags) {
1584		log_warnx("vm \"%s\" cannot change instance permissions", name);
1585		return (EPERM);
1586	}
1587	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1588		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
1589		vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid;
1590		vmc->vmc_insflags = vmcp->vmc_insflags;
1591	} else {
1592		vmc->vmc_insowner.gid = 0;
1593		vmc->vmc_insowner.uid = 0;
1594		vmc->vmc_insflags = 0;
1595	}
1596
1597	/* finished, remove instance flags */
1598	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1599
1600	return (0);
1601}
1602
1603/*
1604 * vm_checkperm
1605 *
1606 * Checks if the user represented by the 'uid' parameter is allowed to
1607 * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1608 * console.)
1609 *
1610 * Parameters:
1611 *  vm: the VM whose permission is to be checked
1612 *  vmo: the required uid/gid to be checked
1613 *  uid: the user ID of the user making the request
1614 *
1615 * Return values:
1616 *   0: the permission should be granted
1617 *  -1: the permission check failed (also returned if vm == null)
1618 */
1619int
1620vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1621{
1622	struct group	*gr;
1623	struct passwd	*pw;
1624	char		**grmem;
1625
1626	/* root has no restrictions */
1627	if (uid == 0)
1628		return (0);
1629
1630	if (vmo == NULL)
1631		return (-1);
1632
1633	/* check user */
1634	if (vm == NULL) {
1635		if  (vmo->uid == uid)
1636			return (0);
1637	} else {
1638		/*
1639		 * check user of running vm (the owner of a running vm can
1640		 * be different to (or more specific than) the configured owner.
1641		 */
1642		if (((vm->vm_state & VM_STATE_RUNNING) && vm->vm_uid == uid) ||
1643		    (!(vm->vm_state & VM_STATE_RUNNING) && vmo->uid == uid))
1644			return (0);
1645	}
1646
1647	/* check groups */
1648	if (vmo->gid != -1) {
1649		if ((pw = getpwuid(uid)) == NULL)
1650			return (-1);
1651		if (pw->pw_gid == vmo->gid)
1652			return (0);
1653		if ((gr = getgrgid(vmo->gid)) != NULL) {
1654			for (grmem = gr->gr_mem; *grmem; grmem++)
1655				if (strcmp(*grmem, pw->pw_name) == 0)
1656					return (0);
1657		}
1658	}
1659
1660	return (-1);
1661}
1662
1663/*
1664 * vm_checkinsflag
1665 *
1666 * Checks wheter the non-root user is allowed to set an instance option.
1667 *
1668 * Parameters:
1669 *  vmc: the VM create parameters
1670 *  flag: the flag to be checked
1671 *  uid: the user ID of the user making the request
1672 *
1673 * Return values:
1674 *   0: the permission should be granted
1675 *  -1: the permission check failed (also returned if vm == null)
1676 */
1677int
1678vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1679{
1680	/* root has no restrictions */
1681	if (uid == 0)
1682		return (0);
1683
1684	if ((vmc->vmc_insflags & flag) == 0)
1685		return (-1);
1686
1687	return (0);
1688}
1689
/*
 * vm_checkaccess
 *
 * Checks if the user represented by the 'uid' parameter is allowed to
 * access the already opened file referred to by the 'fd' parameter.
 *
 * Parameters:
 *  fd: the file descriptor of the opened file
 *  uflag: check if the userid has access to the file
 *  uid: the user ID of the user making the request
 *  amode: the access flags of R_OK and W_OK
 *
 * Return values:
 *   0: the permission should be granted
 *  -1: the permission check failed
 */
int
vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
{
	struct stat	 st;
	struct passwd	*pw;
	struct group	*gr;
	char		**member;
	mode_t		 need;
	const int	 want_read = (amode & R_OK) != 0;
	const int	 want_write = (amode & W_OK) != 0;

	if (fd == -1)
		return (-1);

	/*
	 * File has to be accessible and a regular file
	 */
	if (fstat(fd, &st) == -1)
		return (-1);
	if (!S_ISREG(st.st_mode))
		return (-1);

	/* root has no restrictions */
	if (uid == 0 || uflag == 0)
		return (0);

	/* check other */
	need = (want_write ? S_IWOTH : 0) | (want_read ? S_IROTH : 0);
	if ((st.st_mode & need) == need)
		return (0);

	/* check user */
	need = (want_write ? S_IWUSR : 0) | (want_read ? S_IRUSR : 0);
	if (uid == st.st_uid && (st.st_mode & need) == need)
		return (0);

	/* check groups */
	need = (want_write ? S_IWGRP : 0) | (want_read ? S_IRGRP : 0);
	if ((st.st_mode & need) != need)
		return (-1);

	pw = getpwuid(uid);
	if (pw == NULL)
		return (-1);
	if (pw->pw_gid == st.st_gid)
		return (0);

	/* not the primary group, look through the group's member list */
	gr = getgrgid(st.st_gid);
	if (gr != NULL) {
		for (member = gr->gr_mem; *member != NULL; member++) {
			if (strcmp(*member, pw->pw_name) == 0)
				return (0);
		}
	}

	return (-1);
}
1757
1758int
1759vm_opentty(struct vmd_vm *vm)
1760{
1761	struct ptmget		 ptm;
1762	struct stat		 st;
1763	struct group		*gr;
1764	uid_t			 uid;
1765	gid_t			 gid;
1766	mode_t			 mode;
1767	int			 on;
1768
1769	/*
1770	 * Open tty with pre-opened PTM fd
1771	 */
1772	if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1))
1773		return (-1);
1774
1775	/*
1776	 * We use user ioctl(2) mode to pass break commands.
1777	 */
1778	on = 1;
1779	if (ioctl(ptm.cfd, TIOCUCNTL, &on) == -1)
1780		fatal("could not enable user ioctl mode");
1781
1782	vm->vm_tty = ptm.cfd;
1783	close(ptm.sfd);
1784	if ((vm->vm_ttyname = strdup(ptm.sn)) == NULL)
1785		goto fail;
1786
1787	uid = vm->vm_uid;
1788	gid = vm->vm_params.vmc_owner.gid;
1789
1790	if (vm->vm_params.vmc_owner.gid != -1) {
1791		mode = 0660;
1792	} else if ((gr = getgrnam("tty")) != NULL) {
1793		gid = gr->gr_gid;
1794		mode = 0620;
1795	} else {
1796		mode = 0600;
1797		gid = 0;
1798	}
1799
1800	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
1801	    __func__, vm->vm_params.vmc_params.vcp_name,
1802	    vm->vm_ttyname, uid, gid, mode);
1803
1804	/*
1805	 * Change ownership and mode of the tty as required.
1806	 * Loosely based on the implementation of sshpty.c
1807	 */
1808	if (stat(vm->vm_ttyname, &st) == -1)
1809		goto fail;
1810
1811	if (st.st_uid != uid || st.st_gid != gid) {
1812		if (chown(vm->vm_ttyname, uid, gid) == -1) {
1813			log_warn("chown %s %d %d failed, uid %d",
1814			    vm->vm_ttyname, uid, gid, getuid());
1815
1816			/* Ignore failure on read-only filesystems */
1817			if (!((errno == EROFS) &&
1818			    (st.st_uid == uid || st.st_uid == 0)))
1819				goto fail;
1820		}
1821	}
1822
1823	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
1824		if (chmod(vm->vm_ttyname, mode) == -1) {
1825			log_warn("chmod %s %o failed, uid %d",
1826			    vm->vm_ttyname, mode, getuid());
1827
1828			/* Ignore failure on read-only filesystems */
1829			if (!((errno == EROFS) &&
1830			    (st.st_uid == uid || st.st_uid == 0)))
1831				goto fail;
1832		}
1833	}
1834
1835	return (0);
1836 fail:
1837	vm_closetty(vm);
1838	return (-1);
1839}
1840
1841void
1842vm_closetty(struct vmd_vm *vm)
1843{
1844	if (vm->vm_tty != -1) {
1845		/* Release and close the tty */
1846		if (fchown(vm->vm_tty, 0, 0) == -1)
1847			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1848		if (fchmod(vm->vm_tty, 0666) == -1)
1849			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1850		close(vm->vm_tty);
1851		vm->vm_tty = -1;
1852	}
1853	free(vm->vm_ttyname);
1854	vm->vm_ttyname = NULL;
1855}
1856
1857void
1858switch_remove(struct vmd_switch *vsw)
1859{
1860	if (vsw == NULL)
1861		return;
1862
1863	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1864
1865	free(vsw->sw_group);
1866	free(vsw->sw_name);
1867	free(vsw);
1868}
1869
1870struct vmd_switch *
1871switch_getbyname(const char *name)
1872{
1873	struct vmd_switch	*vsw;
1874
1875	if (name == NULL)
1876		return (NULL);
1877	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1878		if (strcmp(vsw->sw_name, name) == 0)
1879			return (vsw);
1880	}
1881
1882	return (NULL);
1883}
1884
1885struct vmd_user *
1886user_get(uid_t uid)
1887{
1888	struct vmd_user		*usr;
1889
1890	if (uid == 0)
1891		return (NULL);
1892
1893	/* first try to find an existing user */
1894	TAILQ_FOREACH(usr, env->vmd_users, usr_entry) {
1895		if (usr->usr_id.uid == uid)
1896			goto done;
1897	}
1898
1899	if ((usr = calloc(1, sizeof(*usr))) == NULL) {
1900		log_warn("could not allocate user");
1901		return (NULL);
1902	}
1903
1904	usr->usr_id.uid = uid;
1905	usr->usr_id.gid = -1;
1906	TAILQ_INSERT_TAIL(env->vmd_users, usr, usr_entry);
1907
1908 done:
1909	DPRINTF("%s: uid %d #%d +",
1910	    __func__, usr->usr_id.uid, usr->usr_refcnt + 1);
1911	usr->usr_refcnt++;
1912
1913	return (usr);
1914}
1915
1916void
1917user_put(struct vmd_user *usr)
1918{
1919	if (usr == NULL)
1920		return;
1921
1922	DPRINTF("%s: uid %d #%d -",
1923	    __func__, usr->usr_id.uid, usr->usr_refcnt - 1);
1924
1925	if (--usr->usr_refcnt > 0)
1926		return;
1927
1928	TAILQ_REMOVE(env->vmd_users, usr, usr_entry);
1929	free(usr);
1930}
1931
1932void
1933user_inc(struct vm_create_params *vcp, struct vmd_user *usr, int inc)
1934{
1935	char	 mem[FMT_SCALED_STRSIZE];
1936
1937	if (usr == NULL)
1938		return;
1939
1940	/* increment or decrement counters */
1941	inc = inc ? 1 : -1;
1942
1943	usr->usr_maxcpu += vcp->vcp_ncpus * inc;
1944	usr->usr_maxmem += vcp->vcp_memranges[0].vmr_size * inc;
1945	usr->usr_maxifs += vcp->vcp_nnics * inc;
1946
1947	if (log_getverbose() > 1) {
1948		(void)fmt_scaled(usr->usr_maxmem * 1024 * 1024, mem);
1949		log_debug("%s: %c uid %d ref %d cpu %llu mem %s ifs %llu",
1950		    __func__, inc == 1 ? '+' : '-',
1951		    usr->usr_id.uid, usr->usr_refcnt,
1952		    usr->usr_maxcpu, mem, usr->usr_maxifs);
1953	}
1954}
1955
1956int
1957user_checklimit(struct vmd_user *usr, struct vm_create_params *vcp)
1958{
1959	const char	*limit = "";
1960
1961	/* XXX make the limits configurable */
1962	if (usr->usr_maxcpu > VM_DEFAULT_USER_MAXCPU) {
1963		limit = "cpu ";
1964		goto fail;
1965	}
1966	if (usr->usr_maxmem > VM_DEFAULT_USER_MAXMEM) {
1967		limit = "memory ";
1968		goto fail;
1969	}
1970	if (usr->usr_maxifs > VM_DEFAULT_USER_MAXIFS) {
1971		limit = "interface ";
1972		goto fail;
1973	}
1974
1975	return (0);
1976
1977 fail:
1978	log_warnx("%s: user %d %slimit reached", vcp->vcp_name,
1979	    usr->usr_id.uid, limit);
1980	return (-1);
1981}
1982
/*
 * get_string
 *
 * Returns a newly allocated, NUL-terminated copy of the leading run of
 * printable characters in a buffer.
 *
 * Parameters:
 *  ptr: the buffer to read from
 *  len: the number of bytes available at ptr
 *
 * Return values:
 *  the copied string (to be released with free()), or NULL if the
 *  allocation failed
 */
char *
get_string(uint8_t *ptr, size_t len)
{
	size_t	 n = 0;
	char	*str;

	/* isprint() is false for NUL, so the run never contains one. */
	while (n < len && isprint(ptr[n]))
		n++;

	str = malloc(n + 1);
	if (str == NULL)
		return (NULL);
	memcpy(str, ptr, n);
	str[n] = '\0';

	return (str);
}
1994
/*
 * prefixlen2mask
 *
 * Converts an IPv4 prefix length into a netmask in network byte order.
 * A prefix length of 0 yields 0; values above 32 are treated as 32.
 */
uint32_t
prefixlen2mask(uint8_t prefixlen)
{
	uint32_t	 hostorder;

	/* Handled separately: a shift by 32 bits would be undefined. */
	if (prefixlen == 0)
		return (0);

	if (prefixlen > 32)
		prefixlen = 32;

	hostorder = 0xffffffff << (32 - prefixlen);

	return (htonl(hostorder));
}
2006
/*
 * prefixlen2mask6
 *
 * Converts an IPv6 prefix length into a netmask.  Values above 128 are
 * treated as 128.
 *
 * Parameters:
 *  prefixlen: the prefix length in bits
 *  mask: output, receives the netmask
 */
void
prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
{
	struct in6_addr	 s6;
	unsigned int	 nbytes, nbits;

	if (prefixlen > 128)
		prefixlen = 128;

	nbytes = prefixlen / 8;		/* bytes that are fully set */
	nbits = prefixlen % 8;		/* leading bits of the next byte */

	memset(&s6, 0, sizeof(s6));
	memset(s6.s6_addr, 0xff, nbytes);
	if (nbits != 0)
		s6.s6_addr[nbytes] = 0xff00 >> nbits;

	memcpy(mask, &s6, sizeof(s6));
}
2025
2026void
2027getmonotime(struct timeval *tv)
2028{
2029	struct timespec	 ts;
2030
2031	if (clock_gettime(CLOCK_MONOTONIC, &ts))
2032		fatal("clock_gettime");
2033
2034	TIMESPEC_TO_TIMEVAL(tv, &ts);
2035}
2036