/* vmd.c revision 1.120 */
1/*	$OpenBSD: vmd.c,v 1.120 2021/01/27 07:21:54 deraadt Exp $	*/
2
3/*
4 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/param.h>	/* nitems */
20#include <sys/queue.h>
21#include <sys/wait.h>
22#include <sys/cdefs.h>
23#include <sys/stat.h>
24#include <sys/sysctl.h>
25#include <sys/tty.h>
26#include <sys/ttycom.h>
27#include <sys/ioctl.h>
28
29#include <stdio.h>
30#include <stdlib.h>
31#include <string.h>
32#include <termios.h>
33#include <errno.h>
34#include <event.h>
35#include <fcntl.h>
36#include <pwd.h>
37#include <signal.h>
38#include <syslog.h>
39#include <unistd.h>
40#include <util.h>
41#include <ctype.h>
42#include <pwd.h>
43#include <grp.h>
44
45#include <machine/specialreg.h>
46#include <machine/vmmvar.h>
47
48#include "proc.h"
49#include "atomicio.h"
50#include "vmd.h"
51
52__dead void usage(void);
53
54int	 main(int, char **);
55int	 vmd_configure(void);
56void	 vmd_sighdlr(int sig, short event, void *arg);
57void	 vmd_shutdown(void);
58int	 vmd_control_run(void);
59int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
60int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
61int	 vmd_check_vmh(struct vm_dump_header *);
62
63int	 vm_instance(struct privsep *, struct vmd_vm **,
64	    struct vmop_create_params *, uid_t);
65int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
66int	 vm_claimid(const char *, int, uint32_t *);
67void	 start_vm_batch(int, short, void*);
68
/* Global daemon environment, shared by all code in this process */
struct vmd	*env;

static struct privsep_proc procs[] = {
	/* Keep "priv" on top as procs[0] */
	{ "priv",	PROC_PRIV,	NULL, priv },
	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm, vmm_shutdown },
};

/* Which privsep process this instance is running as (set by proc_init) */
enum privsep_procid privsep_process;

/* Timer driving the staggered (batched) startup of configured VMs */
struct event staggered_start_timer;

/* For the privileged process */
static struct privsep_proc *proc_priv = &procs[0];
static struct passwd proc_privpw;	/* all-zero passwd for the priv proc */
static const uint8_t zero_mac[ETHER_ADDR_LEN];	/* "no MAC configured" sentinel */
86
/*
 * Handle an imsg received from the control process (vmctl clients):
 * VM lifecycle requests (start/terminate/pause/send/receive), config
 * load/reload/reset and verbosity changes.  Requests that must be
 * carried out by the vmm process are forwarded there; immediate
 * results are answered back to PROC_CONTROL at the bottom.
 *
 * Returns 0 on success, -1 to have the proc layer tear down the
 * connection (unknown message type or imsg compose failure).
 */
int
vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct privsep			*ps = p->p_ps;
	int				 res = 0, ret = 0, cmd = 0, verbose;
	unsigned int			 v = 0, flags;
	struct vmop_create_params	 vmc;
	struct vmop_id			 vid;
	struct vmop_result		 vmr;
	struct vm_dump_header		 vmh;
	struct vmd_vm			*vm = NULL;
	char				*str = NULL;
	uint32_t			 id = 0;
	struct control_sock		*rcs;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_START_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vmc);
		memcpy(&vmc, imsg->data, sizeof(vmc));
		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
		if (vmc.vmc_flags == 0) {
			/* start an existing VM with pre-configured options */
			if (!(ret == -1 && errno == EALREADY &&
			    !(vm->vm_state & VM_STATE_RUNNING))) {
				res = errno;
				cmd = IMSG_VMDOP_START_VM_RESPONSE;
			}
		} else if (ret != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		/* Registration succeeded (or VM was startable): hand to vmm */
		if (res == 0 &&
		    config_setvm(ps, vm,
		    imsg->hdr.peerid, vm->vm_params.vmc_owner.uid) == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_WAIT_VM_REQUEST:
	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		flags = vid.vid_flags;

		if ((id = vid.vid_id) == 0) {
			/* Lookup vm (id) by name */
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
				break;
			} else if ((vm->vm_state & VM_STATE_SHUTDOWN) &&
			    (flags & VMOP_FORCE) == 0) {
				/* already shutting down; only -f may repeat */
				res = EALREADY;
				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
				break;
			} else if (!(vm->vm_state & VM_STATE_RUNNING)) {
				res = EINVAL;
				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
				break;
			}
			id = vm->vm_vmid;
		} else if ((vm = vm_getbyvmid(id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
			break;
		}
		/* Only the owner (or root) may wait for / terminate a VM */
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    vid.vid_uid) != 0) {
			res = EPERM;
			cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
			break;
		}

		/* Rebuild a minimal vid (id + flags only) and forward to vmm */
		memset(&vid, 0, sizeof(vid));
		vid.vid_id = id;
		vid.vid_flags = flags;
		if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
			return (-1);
		break;
	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		break;
	case IMSG_VMDOP_LOAD:
		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
		str = get_string((uint8_t *)imsg->data,
		    IMSG_DATA_SIZE(imsg));
		/* FALLTHROUGH: LOAD is RELOAD with an explicit file name */
	case IMSG_VMDOP_RELOAD:
		if (vmd_reload(0, str) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		free(str);
		break;
	case IMSG_CTL_RESET:
		IMSG_SIZE_CHECK(imsg, &v);
		memcpy(&v, imsg->data, sizeof(v));
		if (vmd_reload(v, NULL) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		break;
	case IMSG_CTL_VERBOSE:
		IMSG_SIZE_CHECK(imsg, &verbose);
		memcpy(&verbose, imsg->data, sizeof(verbose));
		log_setverbose(verbose);

		/* Propagate the new verbosity level to the children */
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
		cmd = IMSG_CTL_OK;
		break;
	case IMSG_VMDOP_PAUSE_VM:
	case IMSG_VMDOP_UNPAUSE_VM:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		if (vid.vid_id == 0) {
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		/*
		 * NOTE(review): this uses vm_getbyid() (kernel vmm id)
		 * while TERMINATE/SEND above use vm_getbyvmid() (vmd id).
		 * Verify control clients really pass a vmm id here.
		 */
		} else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
			break;
		}
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    vid.vid_uid) != 0) {
			res = EPERM;
			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
			break;
		}
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_SEND_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		if (vid.vid_id == 0) {
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
				close(imsg->fd);	/* drop received fd */
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
			close(imsg->fd);
			break;
		}
		vmr.vmr_id = vid.vid_id;
		log_debug("%s: sending fd to vmm", __func__);
		/* fd ownership passes to vmm along with the message */
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		if (imsg->fd == -1) {
			log_warnx("%s: invalid fd", __func__);
			return (-1);
		}
		/* Read and validate the dump header from the migration fd */
		if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) !=
		    sizeof(vmh)) {
			log_warnx("%s: error reading vmh from received vm",
			    __func__);
			res = EIO;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}

		if (vmd_check_vmh(&vmh)) {
			res = ENOENT;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		/* Next in the dump: the VM's create parameters */
		if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) !=
		    sizeof(vmc)) {
			log_warnx("%s: error reading vmc from received vm",
			    __func__);
			res = EIO;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		/* Receive under the name the client chose, with a fresh id */
		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
		    sizeof(vmc.vmc_params.vcp_name));
		vmc.vmc_params.vcp_id = 0;

		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
		if (ret != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			close(imsg->fd);
		} else {
			vm->vm_state |= VM_STATE_RECEIVED;
			config_setvm(ps, vm, imsg->hdr.peerid,
			    vmc.vmc_owner.uid);
			log_debug("%s: sending fd to vmm", __func__);
			proc_compose_imsg(ps, PROC_VMM, -1,
			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd,
			    NULL, 0);
		}
		break;
	case IMSG_VMDOP_DONE:
		/* Reopen the control sockets after a config action */
		control_reset(&ps->ps_csock);
		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
			control_reset(rcs);
		cmd = 0;
		break;
	default:
		return (-1);
	}

	/* Send the immediate response, if any, back to the control proc */
	switch (cmd) {
	case 0:
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		memset(&vmr, 0, sizeof(vmr));
		vmr.vmr_result = res;
		vmr.vmr_id = id;
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	default:
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
			return (-1);
		break;
	}

	return (0);
}
330
331int
332vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
333{
334	struct vmop_result	 vmr;
335	struct privsep		*ps = p->p_ps;
336	int			 res = 0;
337	struct vmd_vm		*vm;
338	struct vm_create_params	*vcp;
339	struct vmop_info_result	 vir;
340
341	switch (imsg->hdr.type) {
342	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
343		IMSG_SIZE_CHECK(imsg, &vmr);
344		memcpy(&vmr, imsg->data, sizeof(vmr));
345		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
346			break;
347		proc_compose_imsg(ps, PROC_CONTROL, -1,
348		    imsg->hdr.type, imsg->hdr.peerid, -1,
349		    imsg->data, sizeof(imsg->data));
350		log_info("%s: paused vm %d successfully",
351		    vm->vm_params.vmc_params.vcp_name,
352		    vm->vm_vmid);
353		vm->vm_state |= VM_STATE_PAUSED;
354		break;
355	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
356		IMSG_SIZE_CHECK(imsg, &vmr);
357		memcpy(&vmr, imsg->data, sizeof(vmr));
358		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
359			break;
360		proc_compose_imsg(ps, PROC_CONTROL, -1,
361		    imsg->hdr.type, imsg->hdr.peerid, -1,
362		    imsg->data, sizeof(imsg->data));
363		log_info("%s: unpaused vm %d successfully.",
364		    vm->vm_params.vmc_params.vcp_name,
365		    vm->vm_vmid);
366		vm->vm_state &= ~VM_STATE_PAUSED;
367		break;
368	case IMSG_VMDOP_START_VM_RESPONSE:
369		IMSG_SIZE_CHECK(imsg, &vmr);
370		memcpy(&vmr, imsg->data, sizeof(vmr));
371		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
372			break;
373		vm->vm_pid = vmr.vmr_pid;
374		vcp = &vm->vm_params.vmc_params;
375		vcp->vcp_id = vmr.vmr_id;
376
377		/*
378		 * If the peerid is not -1, forward the response back to the
379		 * the control socket.  If it is -1, the request originated
380		 * from the parent, not the control socket.
381		 */
382		if (vm->vm_peerid != (uint32_t)-1) {
383			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
384			    sizeof(vmr.vmr_ttyname));
385			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
386			    imsg->hdr.type, vm->vm_peerid, -1,
387			    &vmr, sizeof(vmr)) == -1) {
388				errno = vmr.vmr_result;
389				log_warn("%s: failed to foward vm result",
390				    vcp->vcp_name);
391				vm_remove(vm, __func__);
392				return (-1);
393			}
394		}
395
396		if (vmr.vmr_result) {
397			errno = vmr.vmr_result;
398			log_warn("%s: failed to start vm", vcp->vcp_name);
399			vm_remove(vm, __func__);
400			break;
401		}
402
403		/* Now configure all the interfaces */
404		if (vm_priv_ifconfig(ps, vm) == -1) {
405			log_warn("%s: failed to configure vm", vcp->vcp_name);
406			vm_remove(vm, __func__);
407			break;
408		}
409
410		log_info("%s: started vm %d successfully, tty %s",
411		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
412		break;
413	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
414		IMSG_SIZE_CHECK(imsg, &vmr);
415		memcpy(&vmr, imsg->data, sizeof(vmr));
416		DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
417		    __func__, vmr.vmr_id);
418		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
419		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
420			break;
421		if (vmr.vmr_result == 0) {
422			/* Mark VM as shutting down */
423			vm->vm_state |= VM_STATE_SHUTDOWN;
424		}
425		break;
426	case IMSG_VMDOP_SEND_VM_RESPONSE:
427		IMSG_SIZE_CHECK(imsg, &vmr);
428		memcpy(&vmr, imsg->data, sizeof(vmr));
429		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
430			break;
431		if (!vmr.vmr_result) {
432			log_info("%s: sent vm %d successfully.",
433			    vm->vm_params.vmc_params.vcp_name,
434			    vm->vm_vmid);
435			if (vm->vm_from_config)
436				vm_stop(vm, 0, __func__);
437			else
438				vm_remove(vm, __func__);
439		}
440
441		/* Send a response if a control client is waiting for it */
442		if (imsg->hdr.peerid != (uint32_t)-1) {
443			/* the error is meaningless for deferred responses */
444			vmr.vmr_result = 0;
445
446			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
447			    IMSG_VMDOP_SEND_VM_RESPONSE,
448			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
449				return (-1);
450		}
451		break;
452	case IMSG_VMDOP_TERMINATE_VM_EVENT:
453		IMSG_SIZE_CHECK(imsg, &vmr);
454		memcpy(&vmr, imsg->data, sizeof(vmr));
455		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
456		    __func__, vmr.vmr_id, vmr.vmr_result);
457		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
458			log_debug("%s: vm %d is no longer available",
459			    __func__, vmr.vmr_id);
460			break;
461		}
462		if (vmr.vmr_result != EAGAIN ||
463		    vm->vm_params.vmc_bootdevice) {
464			if (vm->vm_from_config)
465				vm_stop(vm, 0, __func__);
466			else
467				vm_remove(vm, __func__);
468		} else {
469			/* Stop VM instance but keep the tty open */
470			vm_stop(vm, 1, __func__);
471			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
472		}
473
474		/* Send a response if a control client is waiting for it */
475		if (imsg->hdr.peerid != (uint32_t)-1) {
476			/* the error is meaningless for deferred responses */
477			vmr.vmr_result = 0;
478
479			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
480			    IMSG_VMDOP_TERMINATE_VM_RESPONSE,
481			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
482				return (-1);
483		}
484		break;
485	case IMSG_VMDOP_GET_INFO_VM_DATA:
486		IMSG_SIZE_CHECK(imsg, &vir);
487		memcpy(&vir, imsg->data, sizeof(vir));
488		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
489			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
490			if (vm->vm_ttyname != NULL)
491				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
492				    sizeof(vir.vir_ttyname));
493			log_debug("%s: running vm: %d, vm_state: 0x%x",
494			    __func__, vm->vm_vmid, vm->vm_state);
495			vir.vir_state = vm->vm_state;
496			/* get the user id who started the vm */
497			vir.vir_uid = vm->vm_uid;
498			vir.vir_gid = vm->vm_params.vmc_owner.gid;
499		}
500		if (proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type,
501		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
502			log_debug("%s: GET_INFO_VM failed for vm %d, removing",
503			    __func__, vm->vm_vmid);
504			vm_remove(vm, __func__);
505			return (-1);
506		}
507		break;
508	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
509		/*
510		 * PROC_VMM has responded with the *running* VMs, now we
511		 * append the others. These use the special value 0 for their
512		 * kernel id to indicate that they are not running.
513		 */
514		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
515			if (!(vm->vm_state & VM_STATE_RUNNING)) {
516				memset(&vir, 0, sizeof(vir));
517				vir.vir_info.vir_id = vm->vm_vmid;
518				strlcpy(vir.vir_info.vir_name,
519				    vm->vm_params.vmc_params.vcp_name,
520				    VMM_MAX_NAME_LEN);
521				vir.vir_info.vir_memory_size =
522				    vm->vm_params.vmc_params.
523				    vcp_memranges[0].vmr_size;
524				vir.vir_info.vir_ncpus =
525				    vm->vm_params.vmc_params.vcp_ncpus;
526				/* get the configured user id for this vm */
527				vir.vir_uid = vm->vm_params.vmc_owner.uid;
528				vir.vir_gid = vm->vm_params.vmc_owner.gid;
529				log_debug("%s: vm: %d, vm_state: 0x%x",
530				    __func__, vm->vm_vmid, vm->vm_state);
531				vir.vir_state = vm->vm_state;
532				if (proc_compose_imsg(ps, PROC_CONTROL, -1,
533				    IMSG_VMDOP_GET_INFO_VM_DATA,
534				    imsg->hdr.peerid, -1, &vir,
535				    sizeof(vir)) == -1) {
536					log_debug("%s: GET_INFO_VM_END failed",
537					    __func__);
538					vm_remove(vm, __func__);
539					return (-1);
540				}
541			}
542		}
543		IMSG_SIZE_CHECK(imsg, &res);
544		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
545		break;
546	default:
547		return (-1);
548	}
549
550	return (0);
551}
552
553int
554vmd_check_vmh(struct vm_dump_header *vmh)
555{
556	int i;
557	unsigned int code, leaf;
558	unsigned int a, b, c, d;
559
560	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) {
561		log_warnx("%s: incompatible dump signature", __func__);
562		return (-1);
563	}
564
565	if (vmh->vmh_version != VM_DUMP_VERSION) {
566		log_warnx("%s: incompatible dump version", __func__);
567		return (-1);
568	}
569
570	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
571		code = vmh->vmh_cpuids[i].code;
572		leaf = vmh->vmh_cpuids[i].leaf;
573		if (leaf != 0x00) {
574			log_debug("%s: invalid leaf 0x%x for code 0x%x",
575			    __func__, leaf, code);
576			return (-1);
577		}
578
579		switch (code) {
580		case 0x00:
581			CPUID_LEAF(code, leaf, a, b, c, d);
582			if (vmh->vmh_cpuids[i].a > a) {
583				log_debug("%s: incompatible cpuid level",
584				    __func__);
585				return (-1);
586			}
587			if (!(vmh->vmh_cpuids[i].b == b &&
588			    vmh->vmh_cpuids[i].c == c &&
589			    vmh->vmh_cpuids[i].d == d)) {
590				log_debug("%s: incompatible cpu brand",
591				    __func__);
592				return (-1);
593			}
594			break;
595
596		case 0x01:
597			CPUID_LEAF(code, leaf, a, b, c, d);
598			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
599			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
600				log_debug("%s: incompatible cpu features "
601				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
602				    code, leaf);
603				return (-1);
604			}
605			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
606			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
607				log_debug("%s: incompatible cpu features "
608				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
609				    code, leaf);
610				return (-1);
611			}
612			break;
613
614		case 0x07:
615			CPUID_LEAF(code, leaf, a, b, c, d);
616			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
617			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
618				log_debug("%s: incompatible cpu features "
619				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
620				    code, leaf);
621				return (-1);
622			}
623			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
624			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
625				log_debug("%s: incompatible cpu features "
626				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
627				    code, leaf);
628				return (-1);
629			}
630			break;
631
632		case 0x0d:
633			CPUID_LEAF(code, leaf, a, b, c, d);
634			if (vmh->vmh_cpuids[i].b > b) {
635				log_debug("%s: incompatible cpu: insufficient "
636				    "max save area for enabled XCR0 features",
637				    __func__);
638				return (-1);
639			}
640			if (vmh->vmh_cpuids[i].c > c) {
641				log_debug("%s: incompatible cpu: insufficient "
642				    "max save area for supported XCR0 features",
643				    __func__);
644				return (-1);
645			}
646			break;
647
648		case 0x80000001:
649			CPUID_LEAF(code, leaf, a, b, c, d);
650			if ((vmh->vmh_cpuids[i].a & a) !=
651			    vmh->vmh_cpuids[i].a) {
652				log_debug("%s: incompatible cpu features "
653				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
654				    code, leaf);
655				return (-1);
656			}
657			if ((vmh->vmh_cpuids[i].c & c) !=
658			    vmh->vmh_cpuids[i].c) {
659				log_debug("%s: incompatible cpu features "
660				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
661				    code, leaf);
662				return (-1);
663			}
664			if ((vmh->vmh_cpuids[i].d & d) !=
665			    vmh->vmh_cpuids[i].d) {
666				log_debug("%s: incompatible cpu features "
667				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
668				    code, leaf);
669				return (-1);
670			}
671			break;
672
673		default:
674			log_debug("%s: unknown code 0x%x", __func__, code);
675			return (-1);
676		}
677	}
678
679	return (0);
680}
681
682void
683vmd_sighdlr(int sig, short event, void *arg)
684{
685	if (privsep_process != PROC_PARENT)
686		return;
687	log_debug("%s: handling signal", __func__);
688
689	switch (sig) {
690	case SIGHUP:
691		log_info("%s: reload requested with SIGHUP", __func__);
692
693		/*
694		 * This is safe because libevent uses async signal handlers
695		 * that run in the event loop and not in signal context.
696		 */
697		(void)vmd_reload(0, NULL);
698		break;
699	case SIGPIPE:
700		log_info("%s: ignoring SIGPIPE", __func__);
701		break;
702	case SIGUSR1:
703		log_info("%s: ignoring SIGUSR1", __func__);
704		break;
705	case SIGTERM:
706	case SIGINT:
707		vmd_shutdown();
708		break;
709	default:
710		fatalx("unexpected signal");
711	}
712}
713
714__dead void
715usage(void)
716{
717	extern char *__progname;
718	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
719	    __progname);
720	exit(1);
721}
722
/*
 * vmd entry point.  Parses command-line options, performs the
 * privilege/sanity checks, forks the privsep children (priv, control,
 * vmm) and then runs the parent's event loop.  Only the parent
 * returns from proc_init(); children exec their own run loops.
 */
int
main(int argc, char **argv)
{
	struct privsep		*ps;
	int			 ch;
	const char		*conffile = VMD_CONF;
	enum privsep_procid	 proc_id = PROC_PARENT;
	int			 proc_instance = 0;
	const char		*errp, *title = NULL;
	int			 argc0 = argc;	/* saved for re-exec of children */

	log_init(0, LOG_DAEMON);

	if ((env = calloc(1, sizeof(*env))) == NULL)
		fatal("calloc: env");

	while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) {
		switch (ch) {
		case 'D':
			/* define a config-file macro on the command line */
			if (cmdline_symset(optarg) < 0)
				log_warnx("could not parse macro definition %s",
				    optarg);
			break;
		case 'd':
			env->vmd_debug = 2;
			break;
		case 'f':
			conffile = optarg;
			break;
		case 'v':
			env->vmd_verbose++;
			break;
		case 'n':
			/* config test mode: parse and exit */
			env->vmd_noaction = 1;
			break;
		case 'P':
			/* internal: run as the named privsep process */
			title = optarg;
			proc_id = proc_getid(procs, nitems(procs), title);
			if (proc_id == PROC_MAX)
				fatalx("invalid process name");
			break;
		case 'I':
			/* internal: privsep process instance number */
			proc_instance = strtonum(optarg, 0,
			    PROC_MAX_INSTANCES, &errp);
			if (errp)
				fatalx("invalid process instance");
			break;
		default:
			usage();
		}
	}

	/* No positional arguments are accepted */
	argc -= optind;
	if (argc > 0)
		usage();

	if (env->vmd_noaction && !env->vmd_debug)
		env->vmd_debug = 1;

	/* check for root privileges */
	if (env->vmd_noaction == 0) {
		if (geteuid())
			fatalx("need root privileges");
	}

	ps = &env->vmd_ps;
	ps->ps_env = env;
	env->vmd_fd = -1;

	if (config_init(env) == -1)
		fatal("failed to initialize configuration");

	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
		fatal("unknown user %s", VMD_USER);

	/* First proc runs as root without pledge but in default chroot */
	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */

	/* Open /dev/vmm */
	if (env->vmd_noaction == 0) {
		env->vmd_fd = open(VMM_NODE, O_RDWR);
		if (env->vmd_fd == -1)
			fatal("%s", VMM_NODE);
	}

	/* Configure the control socket */
	ps->ps_csock.cs_name = SOCKET_NAME;
	TAILQ_INIT(&ps->ps_rcsocks);

	/* Configuration will be parsed after forking the children */
	env->vmd_conffile = conffile;

	log_init(env->vmd_debug, LOG_DAEMON);
	log_setverbose(env->vmd_verbose);

	if (env->vmd_noaction)
		ps->ps_noaction = 1;
	ps->ps_instance = proc_instance;
	if (title != NULL)
		ps->ps_title[proc_id] = title;

	/* only the parent returns */
	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
	    proc_id);

	log_procinit("parent");
	if (!env->vmd_debug && daemon(0, 0) == -1)
		fatal("can't daemonize");

	if (ps->ps_noaction == 0)
		log_info("startup");

	event_init();

	/* Route all handled signals through vmd_sighdlr() */
	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);

	signal_add(&ps->ps_evsigint, NULL);
	signal_add(&ps->ps_evsigterm, NULL);
	signal_add(&ps->ps_evsighup, NULL);
	signal_add(&ps->ps_evsigpipe, NULL);
	signal_add(&ps->ps_evsigusr1, NULL);

	if (!env->vmd_noaction)
		proc_connect(ps);

	/* Parse the config, pledge, and start the configured VMs */
	if (vmd_configure() == -1)
		fatalx("configuration failed");

	event_dispatch();

	log_debug("parent exiting");

	return (0);
}
862
863void
864start_vm_batch(int fd, short type, void *args)
865{
866	int		i = 0;
867	struct vmd_vm	*vm;
868
869	log_debug("%s: starting batch of %d vms", __func__,
870	    env->vmd_cfg.parallelism);
871	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
872		if (!(vm->vm_state & VM_STATE_WAITING)) {
873			log_debug("%s: not starting vm %s (disabled)",
874			    __func__,
875			    vm->vm_params.vmc_params.vcp_name);
876			continue;
877		}
878		i++;
879		if (i > env->vmd_cfg.parallelism) {
880			evtimer_add(&staggered_start_timer,
881			    &env->vmd_cfg.delay);
882			break;
883		}
884		vm->vm_state &= ~VM_STATE_WAITING;
885		config_setvm(&env->vmd_ps, vm, -1, vm->vm_params.vmc_owner.uid);
886	}
887	log_debug("%s: done starting vms", __func__);
888}
889
/*
 * Post-fork configuration of the parent process: open the pty master
 * device, drop privileges with pledge(2), parse the configuration
 * file, push the global config to the children, bring up the virtual
 * switches and kick off the staggered VM start.
 *
 * Returns 0 on success, -1 on failure (fatal errors exit directly).
 */
int
vmd_configure(void)
{
	int			ncpus;
	struct vmd_switch	*vsw;
	int ncpu_mib[] = {CTL_HW, HW_NCPUONLINE};
	size_t ncpus_sz = sizeof(ncpus);

	/* Must happen before pledge: no "pts" promise exists */
	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
		fatal("open %s", PATH_PTMDEV);

	/*
	 * pledge in the parent process:
	 * stdio - for malloc and basic I/O including events.
	 * rpath - for reload to open and read the configuration files.
	 * wpath - for opening disk images and tap devices.
	 * tty - for openpty and TIOCUCNTL.
	 * proc - run kill to terminate its children safely.
	 * sendfd - for disks, interfaces and other fds.
	 * recvfd - for send and receive.
	 * getpw - lookup user or group id by name.
	 * chown, fattr - change tty ownership
	 * flock - locking disk files
	 */
	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
	    " chown fattr flock", NULL) == -1)
		fatal("pledge");

	if (parse_config(env->vmd_conffile) == -1) {
		proc_kill(&env->vmd_ps);
		exit(1);
	}

	/* -n: config syntax check only; tear everything down and exit */
	if (env->vmd_noaction) {
		fprintf(stderr, "configuration OK\n");
		proc_kill(&env->vmd_ps);
		exit(0);
	}

	/* Send shared global configuration to all children */
	if (config_setconfig(env) == -1)
		return (-1);

	/* Create the configured bridge/switch interfaces */
	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
		if (vsw->sw_running)
			continue;
		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
			log_warn("%s: failed to create switch %s",
			    __func__, vsw->sw_name);
			switch_remove(vsw);
			return (-1);
		}
	}

	/* Default staggered start: one batch per online CPU, fixed delay */
	if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) {
		env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY;
		if (sysctl(ncpu_mib, NELEM(ncpu_mib), &ncpus, &ncpus_sz, NULL, 0) == -1)
			ncpus = 1;
		env->vmd_cfg.parallelism = ncpus;
		log_debug("%s: setting staggered start configuration to "
		    "parallelism: %d and delay: %lld",
		    __func__, ncpus, (long long) env->vmd_cfg.delay.tv_sec);
	}

	log_debug("%s: starting vms in staggered fashion", __func__);
	evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
	/* start first batch */
	start_vm_batch(0, 0, NULL);

	return (0);
}
961
/*
 * Reload (or reset) the daemon configuration.
 *
 * reset != 0: purge state at the given reset level and propagate the
 * reset to the children, without re-reading any file.
 *
 * reset == 0: (re)read a config file.  With filename == NULL/empty
 * this is a full reload of the default config file (non-running VMs
 * are removed first); with an explicit filename it is a "load" that
 * only adds to the existing VM list.
 *
 * Returns 0 on success, -1 on failure.
 */
int
vmd_reload(unsigned int reset, const char *filename)
{
	struct vmd_vm		*vm, *next_vm;
	struct vmd_switch	*vsw;
	int			 reload = 0;

	/* Switch back to the default config file */
	if (filename == NULL || *filename == '\0') {
		filename = env->vmd_conffile;
		reload = 1;
	}

	log_debug("%s: level %d config file %s", __func__, reset, filename);

	if (reset) {
		/* Purge the configuration */
		config_purge(env, reset);
		config_setreset(env, reset);
	} else {
		/*
		 * Load or reload the configuration.
		 *
		 * Reloading removes all non-running VMs before processing the
		 * config file, whereas loading only adds to the existing list
		 * of VMs.
		 */

		if (reload) {
			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
			    next_vm) {
				if (!(vm->vm_state & VM_STATE_RUNNING)) {
					DPRINTF("%s: calling vm_remove",
					    __func__);
					vm_remove(vm, __func__);
				}
			}
		}

		if (parse_config(filename) == -1) {
			log_debug("%s: failed to load config file %s",
			    __func__, filename);
			return (-1);
		}

		if (reload) {
			/* Update shared global configuration in all children */
			if (config_setconfig(env) == -1)
				return (-1);
		}

		/* Bring up any newly configured switches */
		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
			if (vsw->sw_running)
				continue;
			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
				log_warn("%s: failed to create switch %s",
				    __func__, vsw->sw_name);
				switch_remove(vsw);
				return (-1);
			}
		}

		log_debug("%s: starting vms in staggered fashion", __func__);
		evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
		/* start first batch */
		start_vm_batch(0, 0, NULL);

	}

	return (0);
}
1033
/*
 * Orderly shutdown of the parent: remove all registered VMs, kill
 * the privsep children and exit.  Does not return.
 */
void
vmd_shutdown(void)
{
	struct vmd_vm *vm, *vm_next;

	log_debug("%s: performing shutdown", __func__);

	/* SAFE variant: vm_remove() unlinks vm from the list */
	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
		vm_remove(vm, __func__);
	}

	proc_kill(&env->vmd_ps);
	free(env);

	log_warnx("parent terminating");
	exit(0);
}
1051
1052struct vmd_vm *
1053vm_getbyvmid(uint32_t vmid)
1054{
1055	struct vmd_vm	*vm;
1056
1057	if (vmid == 0)
1058		return (NULL);
1059	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1060		if (vm->vm_vmid == vmid)
1061			return (vm);
1062	}
1063
1064	return (NULL);
1065}
1066
1067struct vmd_vm *
1068vm_getbyid(uint32_t id)
1069{
1070	struct vmd_vm	*vm;
1071
1072	if (id == 0)
1073		return (NULL);
1074	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1075		if (vm->vm_params.vmc_params.vcp_id == id)
1076			return (vm);
1077	}
1078
1079	return (NULL);
1080}
1081
1082uint32_t
1083vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1084{
1085	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1086		return (0);
1087	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1088	    id, vm->vm_vmid);
1089	return (vm->vm_vmid);
1090}
1091
1092uint32_t
1093vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1094{
1095	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1096		return (0);
1097	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1098	    vmid, vm->vm_params.vmc_params.vcp_id);
1099	return (vm->vm_params.vmc_params.vcp_id);
1100}
1101
1102struct vmd_vm *
1103vm_getbyname(const char *name)
1104{
1105	struct vmd_vm	*vm;
1106
1107	if (name == NULL)
1108		return (NULL);
1109	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1110		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1111			return (vm);
1112	}
1113
1114	return (NULL);
1115}
1116
1117struct vmd_vm *
1118vm_getbypid(pid_t pid)
1119{
1120	struct vmd_vm	*vm;
1121
1122	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1123		if (vm->vm_pid == pid)
1124			return (vm);
1125	}
1126
1127	return (NULL);
1128}
1129
/*
 * vm_stop
 *
 * Release all runtime resources of a VM: its imsg channel, disk, network
 * and kernel/cdrom file descriptors and (optionally) its console tty.
 * The VM stays registered in the VM list; use vm_remove() to drop it
 * entirely.
 *
 * Parameters:
 *  vm: the VM to stop, may be NULL (no-op)
 *  keeptty: if non-zero, leave the console tty open
 *  caller: name of the calling function, for logging only
 */
void
vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 i, j;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s stopping vm %d%s",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");

	vm->vm_state &= ~(VM_STATE_RUNNING | VM_STATE_SHUTDOWN);

	/* return the VM's resources to the per-user accounting (inc=0) */
	user_inc(&vm->vm_params.vmc_params, vm->vm_user, 0);
	user_put(vm->vm_user);

	/* tear down the imsg channel to the vm process */
	if (vm->vm_iev.ibuf.fd != -1) {
		event_del(&vm->vm_iev.ev);
		close(vm->vm_iev.ibuf.fd);
	}
	/* close all disk image fds, including qcow2 base images */
	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
			if (vm->vm_disks[i][j] != -1) {
				close(vm->vm_disks[i][j]);
				vm->vm_disks[i][j] = -1;
			}
		}
	}
	/* close all tap fds and free the per-interface strings */
	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
		if (vm->vm_ifs[i].vif_fd != -1) {
			close(vm->vm_ifs[i].vif_fd);
			vm->vm_ifs[i].vif_fd = -1;
		}
		free(vm->vm_ifs[i].vif_name);
		free(vm->vm_ifs[i].vif_switch);
		free(vm->vm_ifs[i].vif_group);
		vm->vm_ifs[i].vif_name = NULL;
		vm->vm_ifs[i].vif_switch = NULL;
		vm->vm_ifs[i].vif_group = NULL;
	}
	if (vm->vm_kernel != -1) {
		close(vm->vm_kernel);
		vm->vm_kernel = -1;
	}
	if (vm->vm_cdrom != -1) {
		close(vm->vm_cdrom);
		vm->vm_cdrom = -1;
	}
	if (!keeptty) {
		vm_closetty(vm);
		vm->vm_uid = 0;
	}
}
1185
/*
 * vm_remove
 *
 * Unlink a VM from the global VM list, stop it and free it.
 *
 * Parameters:
 *  vm: the VM to remove, may be NULL (no-op)
 *  caller: name of the calling function, for logging only
 */
void
vm_remove(struct vmd_vm *vm, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s removing vm %d from running config",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid);

	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);

	/*
	 * NOTE(review): user_put() is called here AND inside vm_stop();
	 * presumably these drop two separate references on vm_user —
	 * verify against user_get() call sites.
	 */
	user_put(vm->vm_user);
	vm_stop(vm, 0, caller);
	free(vm);
}
1204
1205int
1206vm_claimid(const char *name, int uid, uint32_t *id)
1207{
1208	struct name2id *n2i = NULL;
1209
1210	TAILQ_FOREACH(n2i, env->vmd_known, entry)
1211		if (strcmp(n2i->name, name) == 0 && n2i->uid == uid)
1212			goto out;
1213
1214	if (++env->vmd_nvm == 0) {
1215		log_warnx("too many vms");
1216		return -1;
1217	}
1218	if ((n2i = calloc(1, sizeof(struct name2id))) == NULL) {
1219		log_warnx("could not alloc vm name");
1220		return -1;
1221	}
1222	n2i->id = env->vmd_nvm;
1223	n2i->uid = uid;
1224	if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name)) {
1225		log_warnx("vm name too long");
1226		free(n2i);
1227		return -1;
1228	}
1229	TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry);
1230
1231out:
1232	*id = n2i->id;
1233	return 0;
1234}
1235
/*
 * vm_register
 *
 * Validate a VM create request and register the new VM in the global
 * VM list (env->vmd_vms).
 *
 * Parameters:
 *  ps: the privsep context
 *  vmc: the VM create parameters (copied into the new vm structure)
 *  ret_vm: returns the newly registered VM, or the already existing one
 *          when errno is set to EALREADY
 *  id: use this vmd-internal id if non-zero, otherwise claim a new one
 *  uid: the user ID of the user making the request
 *
 * Return values:
 *   0: success, *ret_vm is set
 *  -1: failure, errno indicates the reason (EPERM, EALREADY, EINVAL, ...)
 */
int
vm_register(struct privsep *ps, struct vmop_create_params *vmc,
    struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
{
	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
	struct vm_create_params	*vcp = &vmc->vmc_params;
	struct vmop_owner	*vmo = NULL;
	struct vmd_user		*usr = NULL;
	uint32_t		 nid, rng;
	unsigned int		 i, j;
	struct vmd_switch	*sw;
	char			*s;

	/* Check if this is an instance of another VM */
	if (vm_instance(ps, &vm_parent, vmc, uid) == -1)
		return (-1);

	errno = 0;
	*ret_vm = NULL;

	/* an existing VM with this name/id may only be reported to its owner */
	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    uid) != 0) {
			errno = EPERM;
			goto fail;
		}
		*ret_vm = vm;
		errno = EALREADY;
		goto fail;
	}

	if (vm_parent != NULL)
		vmo = &vm_parent->vm_params.vmc_insowner;

	/* non-root users can only start existing VMs or instances */
	if (vm_checkperm(NULL, vmo, uid) != 0) {
		log_warnx("permission denied");
		errno = EPERM;
		goto fail;
	}
	if (vmc->vmc_flags == 0) {
		log_warnx("invalid configuration, no devices");
		errno = VMD_DISK_MISSING;
		goto fail;
	}
	/* apply defaults, then sanity-check the create parameters */
	if (vcp->vcp_ncpus == 0)
		vcp->vcp_ncpus = 1;
	if (vcp->vcp_memranges[0].vmr_size == 0)
		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
		log_warnx("invalid number of CPUs");
		goto fail;
	} else if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) {
		log_warnx("invalid number of disks");
		goto fail;
	} else if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) {
		log_warnx("invalid number of interfaces");
		goto fail;
	} else if (strlen(vcp->vcp_kernel) == 0 &&
	    vcp->vcp_ndisks == 0 && strlen(vcp->vcp_cdrom) == 0) {
		log_warnx("no kernel or disk/cdrom specified");
		goto fail;
	} else if (strlen(vcp->vcp_name) == 0) {
		log_warnx("invalid VM name");
		goto fail;
	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
	    *vcp->vcp_name == '_') {
		/* reject names starting with characters reserved for options */
		log_warnx("invalid VM name");
		goto fail;
	} else {
		/* names may only contain alphanumerics, '.', '-' and '_' */
		for (s = vcp->vcp_name; *s != '\0'; ++s) {
			if (!(isalnum(*s) || *s == '.' || *s == '-' ||
			    *s == '_')) {
				log_warnx("invalid VM name");
				goto fail;
			}
		}
	}

	/* track active users */
	if (uid != 0 && env->vmd_users != NULL &&
	    (usr = user_get(uid)) == NULL) {
		log_warnx("could not add user");
		goto fail;
	}

	if ((vm = calloc(1, sizeof(*vm))) == NULL)
		goto fail;

	/* from here on vmc/vcp point into the new vm structure */
	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
	vmc = &vm->vm_params;
	vcp = &vmc->vmc_params;
	vm->vm_pid = -1;
	vm->vm_tty = -1;
	vm->vm_receive_fd = -1;
	vm->vm_state &= ~VM_STATE_PAUSED;
	vm->vm_user = usr;

	/* mark all fds as unused so vm_stop() won't close fd 0 */
	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
			vm->vm_disks[i][j] = -1;
	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
		vm->vm_ifs[i].vif_fd = -1;
	for (i = 0; i < vcp->vcp_nnics; i++) {
		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
			/* inherit per-interface flags from the switch */
			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
		}

		/*
		 * If the MAC address is zero, always randomize it in vmd(8)
		 * because we cannot rely on the guest OS to do the right
		 * thing like OpenBSD does.  Based on ether_fakeaddr()
		 * from the kernel, incremented by one to differentiate
		 * the source.
		 */
		if (memcmp(zero_mac, &vcp->vcp_macs[i], ETHER_ADDR_LEN) == 0) {
			rng = arc4random();
			vcp->vcp_macs[i][0] = 0xfe;
			vcp->vcp_macs[i][1] = 0xe1;
			vcp->vcp_macs[i][2] = 0xba + 1;
			vcp->vcp_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
			vcp->vcp_macs[i][4] = rng;
			vcp->vcp_macs[i][5] = rng >> 8;
		}
	}
	vm->vm_kernel = -1;
	vm->vm_cdrom = -1;
	vm->vm_iev.ibuf.fd = -1;

	/*
	 * Assign a new internal Id if not specified and we succeed in
	 * claiming a new Id.
	 */
	if (id != 0)
		vm->vm_vmid = id;
	else if (vm_claimid(vcp->vcp_name, uid, &nid) == -1)
		goto fail;
	else
		vm->vm_vmid = nid;

	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);

	*ret_vm = vm;
	return (0);
 fail:
	if (errno == 0)
		errno = EINVAL;
	return (-1);
}
1388
1389int
1390vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1391    struct vmop_create_params *vmc, uid_t uid)
1392{
1393	char			*name;
1394	struct vm_create_params	*vcp = &vmc->vmc_params;
1395	struct vmop_create_params *vmcp;
1396	struct vm_create_params	*vcpp;
1397	struct vmd_vm		*vm = NULL;
1398	unsigned int		 i, j;
1399	uint32_t		 id;
1400
1401	/* return without error if the parent is NULL (nothing to inherit) */
1402	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1403	    vmc->vmc_instance[0] == '\0')
1404		return (0);
1405
1406	if ((*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) {
1407		errno = VMD_PARENT_INVALID;
1408		return (-1);
1409	}
1410
1411	errno = 0;
1412	vmcp = &(*vm_parent)->vm_params;
1413	vcpp = &vmcp->vmc_params;
1414
1415	/* Are we allowed to create an instance from this VM? */
1416	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1417		log_warnx("vm \"%s\" no permission to create vm instance",
1418		    vcpp->vcp_name);
1419		errno = ENAMETOOLONG;
1420		return (-1);
1421	}
1422
1423	id = vcp->vcp_id;
1424	name = vcp->vcp_name;
1425
1426	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1427	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1428		errno = EPROCLIM;
1429		return (-1);
1430	}
1431
1432	/* CPU */
1433	if (vcp->vcp_ncpus == 0)
1434		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1435	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1436	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1437		log_warnx("vm \"%s\" no permission to set cpus", name);
1438		errno = EPERM;
1439		return (-1);
1440	}
1441
1442	/* memory */
1443	if (vcp->vcp_memranges[0].vmr_size == 0)
1444		vcp->vcp_memranges[0].vmr_size =
1445		    vcpp->vcp_memranges[0].vmr_size;
1446	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1447	    vcp->vcp_memranges[0].vmr_size !=
1448	    vcpp->vcp_memranges[0].vmr_size) {
1449		log_warnx("vm \"%s\" no permission to set memory", name);
1450		errno = EPERM;
1451		return (-1);
1452	}
1453
1454	/* disks cannot be inherited */
1455	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1456	    vcp->vcp_ndisks) {
1457		log_warnx("vm \"%s\" no permission to set disks", name);
1458		errno = EPERM;
1459		return (-1);
1460	}
1461	for (i = 0; i < vcp->vcp_ndisks; i++) {
1462		/* Check if this disk is already used in the parent */
1463		for (j = 0; j < vcpp->vcp_ndisks; j++) {
1464			if (strcmp(vcp->vcp_disks[i],
1465			    vcpp->vcp_disks[j]) == 0) {
1466				log_warnx("vm \"%s\" disk %s cannot be reused",
1467				    name, vcp->vcp_disks[i]);
1468				errno = EBUSY;
1469				return (-1);
1470			}
1471		}
1472		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1473	}
1474
1475	/* interfaces */
1476	if (vcp->vcp_nnics > 0 &&
1477	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1478	    vcp->vcp_nnics != vcpp->vcp_nnics) {
1479		log_warnx("vm \"%s\" no permission to set interfaces", name);
1480		errno = EPERM;
1481		return (-1);
1482	}
1483	for (i = 0; i < vcpp->vcp_nnics; i++) {
1484		/* Interface got overwritten */
1485		if (i < vcp->vcp_nnics)
1486			continue;
1487
1488		/* Copy interface from parent */
1489		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1490		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1491		    sizeof(vmc->vmc_ifnames[i]));
1492		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1493		    sizeof(vmc->vmc_ifswitch[i]));
1494		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1495		    sizeof(vmc->vmc_ifgroup[i]));
1496		memcpy(vcp->vcp_macs[i], vcpp->vcp_macs[i],
1497		    sizeof(vcp->vcp_macs[i]));
1498		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1499		vcp->vcp_nnics++;
1500	}
1501	for (i = 0; i < vcp->vcp_nnics; i++) {
1502		for (j = 0; j < vcpp->vcp_nnics; j++) {
1503			if (memcmp(zero_mac, vcp->vcp_macs[i],
1504			    sizeof(vcp->vcp_macs[i])) != 0 &&
1505			    memcmp(vcpp->vcp_macs[i], vcp->vcp_macs[i],
1506			    sizeof(vcp->vcp_macs[i])) != 0) {
1507				log_warnx("vm \"%s\" lladdr cannot be reused",
1508				    name);
1509				errno = EBUSY;
1510				return (-1);
1511			}
1512			if (strlen(vmc->vmc_ifnames[i]) &&
1513			    strcmp(vmc->vmc_ifnames[i],
1514			    vmcp->vmc_ifnames[j]) == 0) {
1515				log_warnx("vm \"%s\" %s cannot be reused",
1516				    vmc->vmc_ifnames[i], name);
1517				errno = EBUSY;
1518				return (-1);
1519			}
1520		}
1521	}
1522
1523	/* kernel */
1524	if (strlen(vcp->vcp_kernel) > 0) {
1525		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1526			log_warnx("vm \"%s\" no permission to set boot image",
1527			    name);
1528			errno = EPERM;
1529			return (-1);
1530		}
1531		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1532	} else if (strlcpy(vcp->vcp_kernel, vcpp->vcp_kernel,
1533	    sizeof(vcp->vcp_kernel)) >= sizeof(vcp->vcp_kernel)) {
1534		log_warnx("vm \"%s\" kernel name too long", name);
1535		errno = EINVAL;
1536		return (-1);
1537	}
1538
1539	/* cdrom */
1540	if (strlen(vcp->vcp_cdrom) > 0) {
1541		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1542			log_warnx("vm \"%s\" no permission to set cdrom", name);
1543			errno = EPERM;
1544			return (-1);
1545		}
1546		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1547	} else if (strlcpy(vcp->vcp_cdrom, vcpp->vcp_cdrom,
1548	    sizeof(vcp->vcp_cdrom)) >= sizeof(vcp->vcp_cdrom)) {
1549		log_warnx("vm \"%s\" cdrom name too long", name);
1550		errno = EINVAL;
1551		return (-1);
1552	}
1553
1554	/* user */
1555	if (vmc->vmc_owner.uid == 0)
1556		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1557	else if (vmc->vmc_owner.uid != uid &&
1558	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1559		log_warnx("vm \"%s\" user mismatch", name);
1560		errno = EPERM;
1561		return (-1);
1562	}
1563
1564	/* group */
1565	if (vmc->vmc_owner.gid == 0)
1566		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1567	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1568		log_warnx("vm \"%s\" group mismatch", name);
1569		errno = EPERM;
1570		return (-1);
1571	}
1572
1573	/* child instances */
1574	if (vmc->vmc_insflags) {
1575		log_warnx("vm \"%s\" cannot change instance permissions", name);
1576		errno = EPERM;
1577		return (-1);
1578	}
1579	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1580		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
1581		vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid;
1582		vmc->vmc_insflags = vmcp->vmc_insflags;
1583	} else {
1584		vmc->vmc_insowner.gid = 0;
1585		vmc->vmc_insowner.uid = 0;
1586		vmc->vmc_insflags = 0;
1587	}
1588
1589	/* finished, remove instance flags */
1590	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1591
1592	return (0);
1593}
1594
1595/*
1596 * vm_checkperm
1597 *
1598 * Checks if the user represented by the 'uid' parameter is allowed to
1599 * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1600 * console.)
1601 *
1602 * Parameters:
1603 *  vm: the VM whose permission is to be checked
1604 *  vmo: the required uid/gid to be checked
1605 *  uid: the user ID of the user making the request
1606 *
1607 * Return values:
1608 *   0: the permission should be granted
1609 *  -1: the permission check failed (also returned if vm == null)
1610 */
1611int
1612vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1613{
1614	struct group	*gr;
1615	struct passwd	*pw;
1616	char		**grmem;
1617
1618	/* root has no restrictions */
1619	if (uid == 0)
1620		return (0);
1621
1622	if (vmo == NULL)
1623		return (-1);
1624
1625	/* check user */
1626	if (vm == NULL) {
1627		if  (vmo->uid == uid)
1628			return (0);
1629	} else {
1630		/*
1631		 * check user of running vm (the owner of a running vm can
1632		 * be different to (or more specific than) the configured owner.
1633		 */
1634		if (((vm->vm_state & VM_STATE_RUNNING) && vm->vm_uid == uid) ||
1635		    (!(vm->vm_state & VM_STATE_RUNNING) && vmo->uid == uid))
1636			return (0);
1637	}
1638
1639	/* check groups */
1640	if (vmo->gid != -1) {
1641		if ((pw = getpwuid(uid)) == NULL)
1642			return (-1);
1643		if (pw->pw_gid == vmo->gid)
1644			return (0);
1645		if ((gr = getgrgid(vmo->gid)) != NULL) {
1646			for (grmem = gr->gr_mem; *grmem; grmem++)
1647				if (strcmp(*grmem, pw->pw_name) == 0)
1648					return (0);
1649		}
1650	}
1651
1652	return (-1);
1653}
1654
1655/*
1656 * vm_checkinsflag
1657 *
1658 * Checks wheter the non-root user is allowed to set an instance option.
1659 *
1660 * Parameters:
1661 *  vmc: the VM create parameters
1662 *  flag: the flag to be checked
1663 *  uid: the user ID of the user making the request
1664 *
1665 * Return values:
1666 *   0: the permission should be granted
1667 *  -1: the permission check failed (also returned if vm == null)
1668 */
1669int
1670vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1671{
1672	/* root has no restrictions */
1673	if (uid == 0)
1674		return (0);
1675
1676	if ((vmc->vmc_insflags & flag) == 0)
1677		return (-1);
1678
1679	return (0);
1680}
1681
1682/*
1683 * vm_checkaccess
1684 *
1685 * Checks if the user represented by the 'uid' parameter is allowed to
1686 * access the file described by the 'path' parameter.
1687 *
1688 * Parameters:
1689 *  fd: the file descriptor of the opened file
1690 *  uflag: check if the userid has access to the file
1691 *  uid: the user ID of the user making the request
1692 *  amode: the access flags of R_OK and W_OK
1693 *
1694 * Return values:
1695 *   0: the permission should be granted
1696 *  -1: the permission check failed
1697 */
1698int
1699vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
1700{
1701	struct group	*gr;
1702	struct passwd	*pw;
1703	char		**grmem;
1704	struct stat	 st;
1705	mode_t		 mode;
1706
1707	if (fd == -1)
1708		return (-1);
1709
1710	/*
1711	 * File has to be accessible and a regular file
1712	 */
1713	if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode))
1714		return (-1);
1715
1716	/* root has no restrictions */
1717	if (uid == 0 || uflag == 0)
1718		return (0);
1719
1720	/* check other */
1721	mode = amode & W_OK ? S_IWOTH : 0;
1722	mode |= amode & R_OK ? S_IROTH : 0;
1723	if ((st.st_mode & mode) == mode)
1724		return (0);
1725
1726	/* check user */
1727	mode = amode & W_OK ? S_IWUSR : 0;
1728	mode |= amode & R_OK ? S_IRUSR : 0;
1729	if (uid == st.st_uid && (st.st_mode & mode) == mode)
1730		return (0);
1731
1732	/* check groups */
1733	mode = amode & W_OK ? S_IWGRP : 0;
1734	mode |= amode & R_OK ? S_IRGRP : 0;
1735	if ((st.st_mode & mode) != mode)
1736		return (-1);
1737	if ((pw = getpwuid(uid)) == NULL)
1738		return (-1);
1739	if (pw->pw_gid == st.st_gid)
1740		return (0);
1741	if ((gr = getgrgid(st.st_gid)) != NULL) {
1742		for (grmem = gr->gr_mem; *grmem; grmem++)
1743			if (strcmp(*grmem, pw->pw_name) == 0)
1744				return (0);
1745	}
1746
1747	return (-1);
1748}
1749
/*
 * vm_opentty
 *
 * Allocate a pty pair for the VM console via the pre-opened /dev/ptm fd,
 * enable user ioctl mode on the controller side, and set ownership and
 * permissions on the tty device.
 *
 * Return values:
 *   0: success, vm_tty and vm_ttyname are set
 *  -1: failure, any partially set up tty state is released
 */
int
vm_opentty(struct vmd_vm *vm)
{
	struct ptmget		 ptm;
	struct stat		 st;
	struct group		*gr;
	uid_t			 uid;
	gid_t			 gid;
	mode_t			 mode;
	int			 on;

	/*
	 * Open tty with pre-opened PTM fd
	 */
	if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1))
		return (-1);

	/*
	 * We use user ioctl(2) mode to pass break commands.
	 */
	on = 1;
	if (ioctl(ptm.cfd, TIOCUCNTL, &on) == -1)
		fatal("could not enable user ioctl mode");

	/* keep the controller side; the console client opens the tty side */
	vm->vm_tty = ptm.cfd;
	close(ptm.sfd);
	if ((vm->vm_ttyname = strdup(ptm.sn)) == NULL)
		goto fail;

	uid = vm->vm_uid;
	gid = vm->vm_params.vmc_owner.gid;

	/* explicit owner group, the "tty" group, or a root-only fallback */
	if (vm->vm_params.vmc_owner.gid != -1) {
		mode = 0660;
	} else if ((gr = getgrnam("tty")) != NULL) {
		gid = gr->gr_gid;
		mode = 0620;
	} else {
		mode = 0600;
		gid = 0;
	}

	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
	    __func__, vm->vm_params.vmc_params.vcp_name,
	    vm->vm_ttyname, uid, gid, mode);

	/*
	 * Change ownership and mode of the tty as required.
	 * Loosely based on the implementation of sshpty.c
	 */
	if (stat(vm->vm_ttyname, &st) == -1)
		goto fail;

	if (st.st_uid != uid || st.st_gid != gid) {
		if (chown(vm->vm_ttyname, uid, gid) == -1) {
			log_warn("chown %s %d %d failed, uid %d",
			    vm->vm_ttyname, uid, gid, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
		if (chmod(vm->vm_ttyname, mode) == -1) {
			log_warn("chmod %s %o failed, uid %d",
			    vm->vm_ttyname, mode, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	return (0);
 fail:
	vm_closetty(vm);
	return (-1);
}
1832
/*
 * vm_closetty
 *
 * Release the VM's console tty: restore root ownership and a permissive
 * mode on the device, close the fd and free the saved tty name.
 */
void
vm_closetty(struct vmd_vm *vm)
{
	if (vm->vm_tty != -1) {
		/* Release and close the tty */
		if (fchown(vm->vm_tty, 0, 0) == -1)
			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
		if (fchmod(vm->vm_tty, 0666) == -1)
			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
		close(vm->vm_tty);
		vm->vm_tty = -1;
	}
	free(vm->vm_ttyname);
	vm->vm_ttyname = NULL;
}
1848
/*
 * switch_remove
 *
 * Unlink a virtual switch from the global switch list and free it.
 * A NULL argument is a no-op.
 */
void
switch_remove(struct vmd_switch *vsw)
{
	if (vsw == NULL)
		return;

	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);

	free(vsw->sw_group);
	free(vsw->sw_name);
	free(vsw);
}
1861
1862struct vmd_switch *
1863switch_getbyname(const char *name)
1864{
1865	struct vmd_switch	*vsw;
1866
1867	if (name == NULL)
1868		return (NULL);
1869	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1870		if (strcmp(vsw->sw_name, name) == 0)
1871			return (vsw);
1872	}
1873
1874	return (NULL);
1875}
1876
1877struct vmd_user *
1878user_get(uid_t uid)
1879{
1880	struct vmd_user		*usr;
1881
1882	if (uid == 0)
1883		return (NULL);
1884
1885	/* first try to find an existing user */
1886	TAILQ_FOREACH(usr, env->vmd_users, usr_entry) {
1887		if (usr->usr_id.uid == uid)
1888			goto done;
1889	}
1890
1891	if ((usr = calloc(1, sizeof(*usr))) == NULL) {
1892		log_warn("could not allocate user");
1893		return (NULL);
1894	}
1895
1896	usr->usr_id.uid = uid;
1897	usr->usr_id.gid = -1;
1898	TAILQ_INSERT_TAIL(env->vmd_users, usr, usr_entry);
1899
1900 done:
1901	DPRINTF("%s: uid %d #%d +",
1902	    __func__, usr->usr_id.uid, usr->usr_refcnt + 1);
1903	usr->usr_refcnt++;
1904
1905	return (usr);
1906}
1907
1908void
1909user_put(struct vmd_user *usr)
1910{
1911	if (usr == NULL)
1912		return;
1913
1914	DPRINTF("%s: uid %d #%d -",
1915	    __func__, usr->usr_id.uid, usr->usr_refcnt - 1);
1916
1917	if (--usr->usr_refcnt > 0)
1918		return;
1919
1920	TAILQ_REMOVE(env->vmd_users, usr, usr_entry);
1921	free(usr);
1922}
1923
/*
 * user_inc
 *
 * Adjust the per-user resource accounting (cpus, memory, interfaces)
 * by the amounts described in the VM create parameters.
 *
 * Parameters:
 *  vcp: the VM create parameters to account for
 *  usr: the user tracking entry, may be NULL (no-op)
 *  inc: non-zero to add the resources, zero to subtract them
 */
void
user_inc(struct vm_create_params *vcp, struct vmd_user *usr, int inc)
{
	char	 mem[FMT_SCALED_STRSIZE];

	if (usr == NULL)
		return;

	/* increment or decrement counters */
	inc = inc ? 1 : -1;

	usr->usr_maxcpu += vcp->vcp_ncpus * inc;
	usr->usr_maxmem += vcp->vcp_memranges[0].vmr_size * inc;
	usr->usr_maxifs += vcp->vcp_nnics * inc;

	if (log_getverbose() > 1) {
		/* scale usr_maxmem to bytes for fmt_scaled(3); this assumes
		 * vmr_size is counted in megabytes — verify against callers */
		(void)fmt_scaled(usr->usr_maxmem * 1024 * 1024, mem);
		log_debug("%s: %c uid %d ref %d cpu %llu mem %s ifs %llu",
		    __func__, inc == 1 ? '+' : '-',
		    usr->usr_id.uid, usr->usr_refcnt,
		    usr->usr_maxcpu, mem, usr->usr_maxifs);
	}
}
1947
1948int
1949user_checklimit(struct vmd_user *usr, struct vm_create_params *vcp)
1950{
1951	const char	*limit = "";
1952
1953	/* XXX make the limits configurable */
1954	if (usr->usr_maxcpu > VM_DEFAULT_USER_MAXCPU) {
1955		limit = "cpu ";
1956		goto fail;
1957	}
1958	if (usr->usr_maxmem > VM_DEFAULT_USER_MAXMEM) {
1959		limit = "memory ";
1960		goto fail;
1961	}
1962	if (usr->usr_maxifs > VM_DEFAULT_USER_MAXIFS) {
1963		limit = "interface ";
1964		goto fail;
1965	}
1966
1967	return (0);
1968
1969 fail:
1970	log_warnx("%s: user %d %slimit reached", vcp->vcp_name,
1971	    usr->usr_id.uid, limit);
1972	return (-1);
1973}
1974
1975char *
1976get_string(uint8_t *ptr, size_t len)
1977{
1978	size_t	 i;
1979
1980	for (i = 0; i < len; i++)
1981		if (!isprint(ptr[i]))
1982			break;
1983
1984	return strndup(ptr, i);
1985}
1986
/*
 * prefixlen2mask
 *
 * Convert an IPv4 prefix length (clamped to 32) into a netmask in
 * network byte order.
 */
uint32_t
prefixlen2mask(uint8_t prefixlen)
{
	uint32_t	 mask;

	/* a shift by 32 would be undefined, so handle 0 up front */
	if (prefixlen == 0)
		return (0);
	if (prefixlen > 32)
		prefixlen = 32;

	mask = 0xffffffff << (32 - prefixlen);
	return (htonl(mask));
}
1998
/*
 * prefixlen2mask6
 *
 * Convert an IPv6 prefix length (clamped to 128) into a netmask,
 * written to *mask.
 */
void
prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
{
	struct in6_addr	 s6;
	int		 nbytes, nbits;

	if (prefixlen > 128)
		prefixlen = 128;

	nbytes = prefixlen / 8;
	nbits = prefixlen % 8;

	memset(&s6, 0, sizeof(s6));
	memset(s6.s6_addr, 0xff, nbytes);
	if (nbits)
		s6.s6_addr[nbytes] = 0xff00 >> nbits;

	memcpy(mask, &s6, sizeof(s6));
}
2017
/*
 * getmonotime
 *
 * Store the current monotonic system time in *tv.  Fatal on
 * clock_gettime(2) failure.
 */
void
getmonotime(struct timeval *tv)
{
	struct timespec	 ts;

	if (clock_gettime(CLOCK_MONOTONIC, &ts))
		fatal("clock_gettime");

	TIMESPEC_TO_TIMEVAL(tv, &ts);
}
2028