/* vmd.c revision 1.128 */
1/*	$OpenBSD: vmd.c,v 1.128 2021/12/13 18:28:40 deraadt Exp $	*/
2
3/*
4 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/types.h>
20#include <sys/queue.h>
21#include <sys/wait.h>
22#include <sys/stat.h>
23#include <sys/sysctl.h>
24#include <sys/tty.h>
25#include <sys/ttycom.h>
26#include <sys/ioctl.h>
27
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31#include <termios.h>
32#include <errno.h>
33#include <event.h>
34#include <fcntl.h>
35#include <pwd.h>
36#include <signal.h>
37#include <syslog.h>
38#include <unistd.h>
39#include <util.h>
40#include <ctype.h>
41#include <pwd.h>
42#include <grp.h>
43
44#include <machine/specialreg.h>
45#include <machine/vmmvar.h>
46
47#include "proc.h"
48#include "atomicio.h"
49#include "vmd.h"
50
51__dead void usage(void);
52
53int	 main(int, char **);
54int	 vmd_configure(void);
55void	 vmd_sighdlr(int sig, short event, void *arg);
56void	 vmd_shutdown(void);
57int	 vmd_control_run(void);
58int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
59int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
60int	 vmd_dispatch_priv(int, struct privsep_proc *, struct imsg *);
61int	 vmd_check_vmh(struct vm_dump_header *);
62
63int	 vm_instance(struct privsep *, struct vmd_vm **,
64	    struct vmop_create_params *, uid_t);
65int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
66int	 vm_claimid(const char *, int, uint32_t *);
67void	 start_vm_batch(int, short, void*);
68
69struct vmd	*env;
70
71static struct privsep_proc procs[] = {
72	/* Keep "priv" on top as procs[0] */
73	{ "priv",	PROC_PRIV,	vmd_dispatch_priv, priv },
74	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
75	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm, vmm_shutdown },
76};
77
78enum privsep_procid privsep_process;
79
80struct event staggered_start_timer;
81
82/* For the privileged process */
83static struct privsep_proc *proc_priv = &procs[0];
84static struct passwd proc_privpw;
85static const uint8_t zero_mac[ETHER_ADDR_LEN];
86
87int
88vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
89{
90	struct privsep			*ps = p->p_ps;
91	int				 res = 0, ret = 0, cmd = 0, verbose;
92	unsigned int			 v = 0, flags;
93	struct vmop_create_params	 vmc;
94	struct vmop_id			 vid;
95	struct vmop_result		 vmr;
96	struct vm_dump_header		 vmh;
97	struct vmd_vm			*vm = NULL;
98	char				*str = NULL;
99	uint32_t			 id = 0;
100	struct control_sock		*rcs;
101
102	switch (imsg->hdr.type) {
103	case IMSG_VMDOP_START_VM_REQUEST:
104		IMSG_SIZE_CHECK(imsg, &vmc);
105		memcpy(&vmc, imsg->data, sizeof(vmc));
106		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
107		if (vmc.vmc_flags == 0) {
108			/* start an existing VM with pre-configured options */
109			if (!(ret == -1 && errno == EALREADY &&
110			    !(vm->vm_state & VM_STATE_RUNNING))) {
111				res = errno;
112				cmd = IMSG_VMDOP_START_VM_RESPONSE;
113			}
114		} else if (ret != 0) {
115			res = errno;
116			cmd = IMSG_VMDOP_START_VM_RESPONSE;
117		}
118		if (res == 0) {
119			res = config_setvm(ps, vm, imsg->hdr.peerid,
120			    vm->vm_params.vmc_owner.uid);
121			if (res)
122				cmd = IMSG_VMDOP_START_VM_RESPONSE;
123		}
124		break;
125	case IMSG_VMDOP_WAIT_VM_REQUEST:
126	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
127		IMSG_SIZE_CHECK(imsg, &vid);
128		memcpy(&vid, imsg->data, sizeof(vid));
129		flags = vid.vid_flags;
130		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
131
132		if ((id = vid.vid_id) == 0) {
133			/* Lookup vm (id) by name */
134			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
135				res = ENOENT;
136				break;
137			} else if ((vm->vm_state & VM_STATE_SHUTDOWN) &&
138			    (flags & VMOP_FORCE) == 0) {
139				res = EALREADY;
140				break;
141			} else if (!(vm->vm_state & VM_STATE_RUNNING)) {
142				res = EINVAL;
143				break;
144			}
145			id = vm->vm_vmid;
146		} else if ((vm = vm_getbyvmid(id)) == NULL) {
147			res = ENOENT;
148			break;
149		}
150		if (vm_checkperm(vm, &vm->vm_params.vmc_owner, vid.vid_uid)) {
151			res = EPERM;
152			break;
153		}
154
155		/* Only relay TERMINATION requests, not WAIT requests */
156		if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_REQUEST) {
157			memset(&vid, 0, sizeof(vid));
158			vid.vid_id = id;
159			vid.vid_flags = flags;
160
161			if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
162				imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
163				return (-1);
164		}
165		break;
166	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
167		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
168		break;
169	case IMSG_VMDOP_LOAD:
170		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
171		str = get_string((uint8_t *)imsg->data,
172		    IMSG_DATA_SIZE(imsg));
173	case IMSG_VMDOP_RELOAD:
174		if (vmd_reload(0, str) == -1)
175			cmd = IMSG_CTL_FAIL;
176		else
177			cmd = IMSG_CTL_OK;
178		free(str);
179		break;
180	case IMSG_CTL_RESET:
181		IMSG_SIZE_CHECK(imsg, &v);
182		memcpy(&v, imsg->data, sizeof(v));
183		if (vmd_reload(v, NULL) == -1)
184			cmd = IMSG_CTL_FAIL;
185		else
186			cmd = IMSG_CTL_OK;
187		break;
188	case IMSG_CTL_VERBOSE:
189		IMSG_SIZE_CHECK(imsg, &verbose);
190		memcpy(&verbose, imsg->data, sizeof(verbose));
191		log_setverbose(verbose);
192
193		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
194		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
195		cmd = IMSG_CTL_OK;
196		break;
197	case IMSG_VMDOP_PAUSE_VM:
198	case IMSG_VMDOP_UNPAUSE_VM:
199		IMSG_SIZE_CHECK(imsg, &vid);
200		memcpy(&vid, imsg->data, sizeof(vid));
201		if (vid.vid_id == 0) {
202			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
203				res = ENOENT;
204				cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
205				    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
206				    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
207				break;
208			} else {
209				vid.vid_id = vm->vm_vmid;
210			}
211		} else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
212			res = ENOENT;
213			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
214			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
215			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
216			break;
217		}
218		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
219		    vid.vid_uid) != 0) {
220			res = EPERM;
221			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
222			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
223			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
224			break;
225		}
226		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
227		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
228		break;
229	case IMSG_VMDOP_SEND_VM_REQUEST:
230		IMSG_SIZE_CHECK(imsg, &vid);
231		memcpy(&vid, imsg->data, sizeof(vid));
232		id = vid.vid_id;
233		if (vid.vid_id == 0) {
234			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
235				res = ENOENT;
236				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
237				close(imsg->fd);
238				break;
239			} else {
240				vid.vid_id = vm->vm_vmid;
241			}
242		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
243			res = ENOENT;
244			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
245			close(imsg->fd);
246			break;
247		}
248		vmr.vmr_id = vid.vid_id;
249		log_debug("%s: sending fd to vmm", __func__);
250		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
251		    imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid));
252		break;
253	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
254		IMSG_SIZE_CHECK(imsg, &vid);
255		memcpy(&vid, imsg->data, sizeof(vid));
256		if (imsg->fd == -1) {
257			log_warnx("%s: invalid fd", __func__);
258			return (-1);
259		}
260		if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) !=
261		    sizeof(vmh)) {
262			log_warnx("%s: error reading vmh from received vm",
263			    __func__);
264			res = EIO;
265			close(imsg->fd);
266			cmd = IMSG_VMDOP_START_VM_RESPONSE;
267			break;
268		}
269
270		if (vmd_check_vmh(&vmh)) {
271			res = ENOENT;
272			close(imsg->fd);
273			cmd = IMSG_VMDOP_START_VM_RESPONSE;
274			break;
275		}
276		if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) !=
277		    sizeof(vmc)) {
278			log_warnx("%s: error reading vmc from received vm",
279			    __func__);
280			res = EIO;
281			close(imsg->fd);
282			cmd = IMSG_VMDOP_START_VM_RESPONSE;
283			break;
284		}
285		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
286		    sizeof(vmc.vmc_params.vcp_name));
287		vmc.vmc_params.vcp_id = 0;
288
289		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
290		if (ret != 0) {
291			res = errno;
292			cmd = IMSG_VMDOP_START_VM_RESPONSE;
293			close(imsg->fd);
294		} else {
295			vm->vm_state |= VM_STATE_RECEIVED;
296			config_setvm(ps, vm, imsg->hdr.peerid,
297			    vmc.vmc_owner.uid);
298			log_debug("%s: sending fd to vmm", __func__);
299			proc_compose_imsg(ps, PROC_VMM, -1,
300			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd,
301			    NULL, 0);
302		}
303		break;
304	case IMSG_VMDOP_DONE:
305		control_reset(&ps->ps_csock);
306		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
307			control_reset(rcs);
308		cmd = 0;
309		break;
310	default:
311		return (-1);
312	}
313
314	switch (cmd) {
315	case 0:
316		break;
317	case IMSG_VMDOP_START_VM_RESPONSE:
318	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
319		memset(&vmr, 0, sizeof(vmr));
320		vmr.vmr_result = res;
321		vmr.vmr_id = id;
322		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
323		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
324			return (-1);
325		break;
326	default:
327		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
328		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
329			return (-1);
330		break;
331	}
332
333	return (0);
334}
335
336int
337vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
338{
339	struct vmop_result	 vmr;
340	struct privsep		*ps = p->p_ps;
341	int			 res = 0;
342	struct vmd_vm		*vm;
343	struct vm_create_params	*vcp;
344	struct vmop_info_result	 vir;
345
346	switch (imsg->hdr.type) {
347	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
348		IMSG_SIZE_CHECK(imsg, &vmr);
349		memcpy(&vmr, imsg->data, sizeof(vmr));
350		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
351			break;
352		proc_compose_imsg(ps, PROC_CONTROL, -1,
353		    imsg->hdr.type, imsg->hdr.peerid, -1,
354		    imsg->data, sizeof(imsg->data));
355		log_info("%s: paused vm %d successfully",
356		    vm->vm_params.vmc_params.vcp_name,
357		    vm->vm_vmid);
358		vm->vm_state |= VM_STATE_PAUSED;
359		break;
360	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
361		IMSG_SIZE_CHECK(imsg, &vmr);
362		memcpy(&vmr, imsg->data, sizeof(vmr));
363		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
364			break;
365		proc_compose_imsg(ps, PROC_CONTROL, -1,
366		    imsg->hdr.type, imsg->hdr.peerid, -1,
367		    imsg->data, sizeof(imsg->data));
368		log_info("%s: unpaused vm %d successfully.",
369		    vm->vm_params.vmc_params.vcp_name,
370		    vm->vm_vmid);
371		vm->vm_state &= ~VM_STATE_PAUSED;
372		break;
373	case IMSG_VMDOP_START_VM_RESPONSE:
374		IMSG_SIZE_CHECK(imsg, &vmr);
375		memcpy(&vmr, imsg->data, sizeof(vmr));
376		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
377			break;
378		vm->vm_pid = vmr.vmr_pid;
379		vcp = &vm->vm_params.vmc_params;
380		vcp->vcp_id = vmr.vmr_id;
381
382		/*
383		 * If the peerid is not -1, forward the response back to the
384		 * the control socket.  If it is -1, the request originated
385		 * from the parent, not the control socket.
386		 */
387		if (vm->vm_peerid != (uint32_t)-1) {
388			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
389			    sizeof(vmr.vmr_ttyname));
390			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
391			    imsg->hdr.type, vm->vm_peerid, -1,
392			    &vmr, sizeof(vmr)) == -1) {
393				errno = vmr.vmr_result;
394				log_warn("%s: failed to foward vm result",
395				    vcp->vcp_name);
396				vm_remove(vm, __func__);
397				return (-1);
398			}
399		}
400
401		if (vmr.vmr_result) {
402			errno = vmr.vmr_result;
403			log_warn("%s: failed to start vm", vcp->vcp_name);
404			vm_remove(vm, __func__);
405			break;
406		}
407
408		/* Now configure all the interfaces */
409		if (vm_priv_ifconfig(ps, vm) == -1) {
410			log_warn("%s: failed to configure vm", vcp->vcp_name);
411			vm_remove(vm, __func__);
412			break;
413		}
414
415		log_info("%s: started vm %d successfully, tty %s",
416		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
417		break;
418	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
419		IMSG_SIZE_CHECK(imsg, &vmr);
420		memcpy(&vmr, imsg->data, sizeof(vmr));
421
422		if (vmr.vmr_result) {
423			DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
424			    __func__, vmr.vmr_id);
425			proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
426		} else {
427			if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
428				break;
429			/* Mark VM as shutting down */
430			vm->vm_state |= VM_STATE_SHUTDOWN;
431		}
432		break;
433	case IMSG_VMDOP_SEND_VM_RESPONSE:
434		IMSG_SIZE_CHECK(imsg, &vmr);
435		memcpy(&vmr, imsg->data, sizeof(vmr));
436		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
437			break;
438		if (!vmr.vmr_result) {
439			log_info("%s: sent vm %d successfully.",
440			    vm->vm_params.vmc_params.vcp_name,
441			    vm->vm_vmid);
442			if (vm->vm_from_config)
443				vm_stop(vm, 0, __func__);
444			else
445				vm_remove(vm, __func__);
446		}
447
448		/* Send a response if a control client is waiting for it */
449		if (imsg->hdr.peerid != (uint32_t)-1) {
450			/* the error is meaningless for deferred responses */
451			vmr.vmr_result = 0;
452
453			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
454			    IMSG_VMDOP_SEND_VM_RESPONSE,
455			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
456				return (-1);
457		}
458		break;
459	case IMSG_VMDOP_TERMINATE_VM_EVENT:
460		IMSG_SIZE_CHECK(imsg, &vmr);
461		memcpy(&vmr, imsg->data, sizeof(vmr));
462		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
463		    __func__, vmr.vmr_id, vmr.vmr_result);
464		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
465			log_debug("%s: vm %d is no longer available",
466			    __func__, vmr.vmr_id);
467			break;
468		}
469		if (vmr.vmr_result != EAGAIN ||
470		    vm->vm_params.vmc_bootdevice) {
471			if (vm->vm_from_config)
472				vm_stop(vm, 0, __func__);
473			else
474				vm_remove(vm, __func__);
475		} else {
476			/* Stop VM instance but keep the tty open */
477			vm_stop(vm, 1, __func__);
478			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
479		}
480
481		/* The error is meaningless for deferred responses */
482		vmr.vmr_result = 0;
483
484		if (proc_compose_imsg(ps, PROC_CONTROL, -1,
485			IMSG_VMDOP_TERMINATE_VM_EVENT,
486			imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
487			return (-1);
488		break;
489	case IMSG_VMDOP_GET_INFO_VM_DATA:
490		IMSG_SIZE_CHECK(imsg, &vir);
491		memcpy(&vir, imsg->data, sizeof(vir));
492		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
493			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
494			if (vm->vm_ttyname != NULL)
495				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
496				    sizeof(vir.vir_ttyname));
497			log_debug("%s: running vm: %d, vm_state: 0x%x",
498			    __func__, vm->vm_vmid, vm->vm_state);
499			vir.vir_state = vm->vm_state;
500			/* get the user id who started the vm */
501			vir.vir_uid = vm->vm_uid;
502			vir.vir_gid = vm->vm_params.vmc_owner.gid;
503		}
504		if (proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type,
505		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
506			log_debug("%s: GET_INFO_VM failed for vm %d, removing",
507			    __func__, vm->vm_vmid);
508			vm_remove(vm, __func__);
509			return (-1);
510		}
511		break;
512	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
513		/*
514		 * PROC_VMM has responded with the *running* VMs, now we
515		 * append the others. These use the special value 0 for their
516		 * kernel id to indicate that they are not running.
517		 */
518		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
519			if (!(vm->vm_state & VM_STATE_RUNNING)) {
520				memset(&vir, 0, sizeof(vir));
521				vir.vir_info.vir_id = vm->vm_vmid;
522				strlcpy(vir.vir_info.vir_name,
523				    vm->vm_params.vmc_params.vcp_name,
524				    VMM_MAX_NAME_LEN);
525				vir.vir_info.vir_memory_size =
526				    vm->vm_params.vmc_params.
527				    vcp_memranges[0].vmr_size;
528				vir.vir_info.vir_ncpus =
529				    vm->vm_params.vmc_params.vcp_ncpus;
530				/* get the configured user id for this vm */
531				vir.vir_uid = vm->vm_params.vmc_owner.uid;
532				vir.vir_gid = vm->vm_params.vmc_owner.gid;
533				log_debug("%s: vm: %d, vm_state: 0x%x",
534				    __func__, vm->vm_vmid, vm->vm_state);
535				vir.vir_state = vm->vm_state;
536				if (proc_compose_imsg(ps, PROC_CONTROL, -1,
537				    IMSG_VMDOP_GET_INFO_VM_DATA,
538				    imsg->hdr.peerid, -1, &vir,
539				    sizeof(vir)) == -1) {
540					log_debug("%s: GET_INFO_VM_END failed",
541					    __func__);
542					vm_remove(vm, __func__);
543					return (-1);
544				}
545			}
546		}
547		IMSG_SIZE_CHECK(imsg, &res);
548		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
549		break;
550	default:
551		return (-1);
552	}
553
554	return (0);
555}
556
557int
558vmd_dispatch_priv(int fd, struct privsep_proc *p, struct imsg *imsg)
559{
560	struct vmop_addr_result	 var;
561
562	switch (imsg->hdr.type) {
563	case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
564		IMSG_SIZE_CHECK(imsg, &var);
565		memcpy(&var, imsg->data, sizeof(var));
566		proc_forward_imsg(p->p_ps, imsg, PROC_VMM, -1);
567		break;
568	default:
569		return (-1);
570	}
571
572	return (0);
573}
574
575int
576vmd_check_vmh(struct vm_dump_header *vmh)
577{
578	int i;
579	unsigned int code, leaf;
580	unsigned int a, b, c, d;
581
582	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) {
583		log_warnx("%s: incompatible dump signature", __func__);
584		return (-1);
585	}
586
587	if (vmh->vmh_version != VM_DUMP_VERSION) {
588		log_warnx("%s: incompatible dump version", __func__);
589		return (-1);
590	}
591
592	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
593		code = vmh->vmh_cpuids[i].code;
594		leaf = vmh->vmh_cpuids[i].leaf;
595		if (leaf != 0x00) {
596			log_debug("%s: invalid leaf 0x%x for code 0x%x",
597			    __func__, leaf, code);
598			return (-1);
599		}
600
601		switch (code) {
602		case 0x00:
603			CPUID_LEAF(code, leaf, a, b, c, d);
604			if (vmh->vmh_cpuids[i].a > a) {
605				log_debug("%s: incompatible cpuid level",
606				    __func__);
607				return (-1);
608			}
609			if (!(vmh->vmh_cpuids[i].b == b &&
610			    vmh->vmh_cpuids[i].c == c &&
611			    vmh->vmh_cpuids[i].d == d)) {
612				log_debug("%s: incompatible cpu brand",
613				    __func__);
614				return (-1);
615			}
616			break;
617
618		case 0x01:
619			CPUID_LEAF(code, leaf, a, b, c, d);
620			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
621			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
622				log_debug("%s: incompatible cpu features "
623				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
624				    code, leaf);
625				return (-1);
626			}
627			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
628			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
629				log_debug("%s: incompatible cpu features "
630				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
631				    code, leaf);
632				return (-1);
633			}
634			break;
635
636		case 0x07:
637			CPUID_LEAF(code, leaf, a, b, c, d);
638			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
639			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
640				log_debug("%s: incompatible cpu features "
641				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
642				    code, leaf);
643				return (-1);
644			}
645			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
646			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
647				log_debug("%s: incompatible cpu features "
648				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
649				    code, leaf);
650				return (-1);
651			}
652			break;
653
654		case 0x0d:
655			CPUID_LEAF(code, leaf, a, b, c, d);
656			if (vmh->vmh_cpuids[i].b > b) {
657				log_debug("%s: incompatible cpu: insufficient "
658				    "max save area for enabled XCR0 features",
659				    __func__);
660				return (-1);
661			}
662			if (vmh->vmh_cpuids[i].c > c) {
663				log_debug("%s: incompatible cpu: insufficient "
664				    "max save area for supported XCR0 features",
665				    __func__);
666				return (-1);
667			}
668			break;
669
670		case 0x80000001:
671			CPUID_LEAF(code, leaf, a, b, c, d);
672			if ((vmh->vmh_cpuids[i].a & a) !=
673			    vmh->vmh_cpuids[i].a) {
674				log_debug("%s: incompatible cpu features "
675				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
676				    code, leaf);
677				return (-1);
678			}
679			if ((vmh->vmh_cpuids[i].c & c) !=
680			    vmh->vmh_cpuids[i].c) {
681				log_debug("%s: incompatible cpu features "
682				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
683				    code, leaf);
684				return (-1);
685			}
686			if ((vmh->vmh_cpuids[i].d & d) !=
687			    vmh->vmh_cpuids[i].d) {
688				log_debug("%s: incompatible cpu features "
689				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
690				    code, leaf);
691				return (-1);
692			}
693			break;
694
695		default:
696			log_debug("%s: unknown code 0x%x", __func__, code);
697			return (-1);
698		}
699	}
700
701	return (0);
702}
703
704void
705vmd_sighdlr(int sig, short event, void *arg)
706{
707	if (privsep_process != PROC_PARENT)
708		return;
709	log_debug("%s: handling signal", __func__);
710
711	switch (sig) {
712	case SIGHUP:
713		log_info("%s: reload requested with SIGHUP", __func__);
714
715		/*
716		 * This is safe because libevent uses async signal handlers
717		 * that run in the event loop and not in signal context.
718		 */
719		(void)vmd_reload(0, NULL);
720		break;
721	case SIGPIPE:
722		log_info("%s: ignoring SIGPIPE", __func__);
723		break;
724	case SIGUSR1:
725		log_info("%s: ignoring SIGUSR1", __func__);
726		break;
727	case SIGTERM:
728	case SIGINT:
729		vmd_shutdown();
730		break;
731	default:
732		fatalx("unexpected signal");
733	}
734}
735
/* Print the command line synopsis to stderr and exit non-zero. */
__dead void
usage(void)
{
	extern char *__progname;
	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
	    __progname);
	exit(1);
}
744
/*
 * Parent process entry point: parse command line options, set up the
 * privsep environment, fork the child processes ("priv", "control",
 * "vmm" from procs[]), install signal handlers and run the libevent
 * loop.  proc_init() only returns in the parent.
 */
int
main(int argc, char **argv)
{
	struct privsep		*ps;
	int			 ch;
	const char		*conffile = VMD_CONF;
	enum privsep_procid	 proc_id = PROC_PARENT;
	int			 proc_instance = 0;
	const char		*errp, *title = NULL;
	int			 argc0 = argc;	/* keep pre-getopt argc for proc_init() */

	log_init(0, LOG_DAEMON);

	if ((env = calloc(1, sizeof(*env))) == NULL)
		fatal("calloc: env");

	while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) {
		switch (ch) {
		case 'D':
			/* define a config macro, e.g. -D sets=1 */
			if (cmdline_symset(optarg) < 0)
				log_warnx("could not parse macro definition %s",
				    optarg);
			break;
		case 'd':
			env->vmd_debug = 2;
			break;
		case 'f':
			conffile = optarg;
			break;
		case 'v':
			env->vmd_verbose++;
			break;
		case 'n':
			/* config test mode: parse and exit */
			env->vmd_noaction = 1;
			break;
		case 'P':
			/* internal: re-exec as a specific child process */
			title = optarg;
			proc_id = proc_getid(procs, nitems(procs), title);
			if (proc_id == PROC_MAX)
				fatalx("invalid process name");
			break;
		case 'I':
			/* internal: child process instance number */
			proc_instance = strtonum(optarg, 0,
			    PROC_MAX_INSTANCES, &errp);
			if (errp)
				fatalx("invalid process instance");
			break;
		default:
			usage();
		}
	}

	argc -= optind;
	if (argc > 0)
		usage();

	/* -n implies at least minimal debug output to the terminal */
	if (env->vmd_noaction && !env->vmd_debug)
		env->vmd_debug = 1;

	log_init(env->vmd_debug, LOG_DAEMON);
	log_setverbose(env->vmd_verbose);

	/* check for root privileges */
	if (env->vmd_noaction == 0) {
		if (geteuid())
			fatalx("need root privileges");
	}

	ps = &env->vmd_ps;
	ps->ps_env = env;
	env->vmd_fd = -1;

	if (config_init(env) == -1)
		fatal("failed to initialize configuration");

	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
		fatal("unknown user %s", VMD_USER);

	/* First proc runs as root without pledge but in default chroot */
	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */

	/* Open /dev/vmm */
	if (env->vmd_noaction == 0) {
		env->vmd_fd = open(VMM_NODE, O_RDWR);
		if (env->vmd_fd == -1)
			fatal("%s", VMM_NODE);
	}

	/* Configure the control socket */
	ps->ps_csock.cs_name = SOCKET_NAME;
	TAILQ_INIT(&ps->ps_rcsocks);

	/* Configuration will be parsed after forking the children */
	env->vmd_conffile = conffile;

	if (env->vmd_noaction)
		ps->ps_noaction = 1;
	ps->ps_instance = proc_instance;
	if (title != NULL)
		ps->ps_title[proc_id] = title;

	/* only the parent returns */
	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
	    proc_id);

	log_procinit("parent");
	if (!env->vmd_debug && daemon(0, 0) == -1)
		fatal("can't daemonize");

	if (ps->ps_noaction == 0)
		log_info("startup");

	event_init();

	/* Route all termination/reload signals through vmd_sighdlr() */
	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);

	signal_add(&ps->ps_evsigint, NULL);
	signal_add(&ps->ps_evsigterm, NULL);
	signal_add(&ps->ps_evsighup, NULL);
	signal_add(&ps->ps_evsigpipe, NULL);
	signal_add(&ps->ps_evsigusr1, NULL);

	if (!env->vmd_noaction)
		proc_connect(ps);

	if (vmd_configure() == -1)
		fatalx("configuration failed");

	event_dispatch();

	log_debug("parent exiting");

	return (0);
}
884
885void
886start_vm_batch(int fd, short type, void *args)
887{
888	int		i = 0;
889	struct vmd_vm	*vm;
890
891	log_debug("%s: starting batch of %d vms", __func__,
892	    env->vmd_cfg.parallelism);
893	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
894		if (!(vm->vm_state & VM_STATE_WAITING)) {
895			log_debug("%s: not starting vm %s (disabled)",
896			    __func__,
897			    vm->vm_params.vmc_params.vcp_name);
898			continue;
899		}
900		i++;
901		if (i > env->vmd_cfg.parallelism) {
902			evtimer_add(&staggered_start_timer,
903			    &env->vmd_cfg.delay);
904			break;
905		}
906		vm->vm_state &= ~VM_STATE_WAITING;
907		config_setvm(&env->vmd_ps, vm, -1, vm->vm_params.vmc_owner.uid);
908	}
909	log_debug("%s: done starting vms", __func__);
910}
911
/*
 * Parent-side startup configuration: open the pty master device, drop
 * privileges with pledge(2), parse the config file, push the global
 * config to the children, bring up the configured switches and kick
 * off the first staggered batch of VM starts.
 *
 * Returns 0 on success, -1 on failure (or exits directly on parse
 * errors and in -n config-test mode).
 */
int
vmd_configure(void)
{
	int			ncpus;
	struct vmd_switch	*vsw;
	int ncpu_mib[] = {CTL_HW, HW_NCPUONLINE};
	size_t ncpus_sz = sizeof(ncpus);

	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
		fatal("open %s", PATH_PTMDEV);

	/*
	 * pledge in the parent process:
	 * stdio - for malloc and basic I/O including events.
	 * rpath - for reload to open and read the configuration files.
	 * wpath - for opening disk images and tap devices.
	 * tty - for openpty and TIOCUCNTL.
	 * proc - run kill to terminate its children safely.
	 * sendfd - for disks, interfaces and other fds.
	 * recvfd - for send and receive.
	 * getpw - lookup user or group id by name.
	 * chown, fattr - change tty ownership
	 * flock - locking disk files
	 */
	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
	    " chown fattr flock", NULL) == -1)
		fatal("pledge");

	if (parse_config(env->vmd_conffile) == -1) {
		proc_kill(&env->vmd_ps);
		exit(1);
	}

	/* -n mode: report the parse result and exit without starting VMs */
	if (env->vmd_noaction) {
		fprintf(stderr, "configuration OK\n");
		proc_kill(&env->vmd_ps);
		exit(0);
	}

	/* Send shared global configuration to all children */
	if (config_setconfig(env) == -1)
		return (-1);

	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
		if (vsw->sw_running)
			continue;
		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
			log_warn("%s: failed to create switch %s",
			    __func__, vsw->sw_name);
			switch_remove(vsw);
			return (-1);
		}
	}

	/* Default staggered start: one batch per online CPU, 30s apart */
	if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) {
		env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY;
		/* fall back to a single-VM batch if sysctl fails */
		if (sysctl(ncpu_mib, nitems(ncpu_mib), &ncpus, &ncpus_sz, NULL, 0) == -1)
			ncpus = 1;
		env->vmd_cfg.parallelism = ncpus;
		log_debug("%s: setting staggered start configuration to "
		    "parallelism: %d and delay: %lld",
		    __func__, ncpus, (long long) env->vmd_cfg.delay.tv_sec);
	}

	log_debug("%s: starting vms in staggered fashion", __func__);
	evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
	/* start first batch */
	start_vm_batch(0, 0, NULL);

	return (0);
}
983
984int
985vmd_reload(unsigned int reset, const char *filename)
986{
987	struct vmd_vm		*vm, *next_vm;
988	struct vmd_switch	*vsw;
989	int			 reload = 0;
990
991	/* Switch back to the default config file */
992	if (filename == NULL || *filename == '\0') {
993		filename = env->vmd_conffile;
994		reload = 1;
995	}
996
997	log_debug("%s: level %d config file %s", __func__, reset, filename);
998
999	if (reset) {
1000		/* Purge the configuration */
1001		config_purge(env, reset);
1002		config_setreset(env, reset);
1003	} else {
1004		/*
1005		 * Load or reload the configuration.
1006		 *
1007		 * Reloading removes all non-running VMs before processing the
1008		 * config file, whereas loading only adds to the existing list
1009		 * of VMs.
1010		 */
1011
1012		if (reload) {
1013			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
1014			    next_vm) {
1015				if (!(vm->vm_state & VM_STATE_RUNNING)) {
1016					DPRINTF("%s: calling vm_remove",
1017					    __func__);
1018					vm_remove(vm, __func__);
1019				}
1020			}
1021		}
1022
1023		if (parse_config(filename) == -1) {
1024			log_debug("%s: failed to load config file %s",
1025			    __func__, filename);
1026			return (-1);
1027		}
1028
1029		if (reload) {
1030			/* Update shared global configuration in all children */
1031			if (config_setconfig(env) == -1)
1032				return (-1);
1033		}
1034
1035		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1036			if (vsw->sw_running)
1037				continue;
1038			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
1039				log_warn("%s: failed to create switch %s",
1040				    __func__, vsw->sw_name);
1041				switch_remove(vsw);
1042				return (-1);
1043			}
1044		}
1045
1046		log_debug("%s: starting vms in staggered fashion", __func__);
1047		evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
1048		/* start first batch */
1049		start_vm_batch(0, 0, NULL);
1050
1051		}
1052
1053	return (0);
1054}
1055
1056void
1057vmd_shutdown(void)
1058{
1059	struct vmd_vm *vm, *vm_next;
1060
1061	log_debug("%s: performing shutdown", __func__);
1062
1063	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
1064		vm_remove(vm, __func__);
1065	}
1066
1067	proc_kill(&env->vmd_ps);
1068	free(env);
1069
1070	log_warnx("parent terminating");
1071	exit(0);
1072}
1073
1074struct vmd_vm *
1075vm_getbyvmid(uint32_t vmid)
1076{
1077	struct vmd_vm	*vm;
1078
1079	if (vmid == 0)
1080		return (NULL);
1081	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1082		if (vm->vm_vmid == vmid)
1083			return (vm);
1084	}
1085
1086	return (NULL);
1087}
1088
1089struct vmd_vm *
1090vm_getbyid(uint32_t id)
1091{
1092	struct vmd_vm	*vm;
1093
1094	if (id == 0)
1095		return (NULL);
1096	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1097		if (vm->vm_params.vmc_params.vcp_id == id)
1098			return (vm);
1099	}
1100
1101	return (NULL);
1102}
1103
1104uint32_t
1105vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1106{
1107	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1108		return (0);
1109	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1110	    id, vm->vm_vmid);
1111	return (vm->vm_vmid);
1112}
1113
1114uint32_t
1115vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1116{
1117	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1118		return (0);
1119	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1120	    vmid, vm->vm_params.vmc_params.vcp_id);
1121	return (vm->vm_params.vmc_params.vcp_id);
1122}
1123
1124struct vmd_vm *
1125vm_getbyname(const char *name)
1126{
1127	struct vmd_vm	*vm;
1128
1129	if (name == NULL)
1130		return (NULL);
1131	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1132		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1133			return (vm);
1134	}
1135
1136	return (NULL);
1137}
1138
1139struct vmd_vm *
1140vm_getbypid(pid_t pid)
1141{
1142	struct vmd_vm	*vm;
1143
1144	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1145		if (vm->vm_pid == pid)
1146			return (vm);
1147	}
1148
1149	return (NULL);
1150}
1151
/*
 * vm_stop
 *
 * Release the runtime resources of a VM: user accounting, the imsg
 * channel to the VM process, all disk/nic/kernel/cdrom descriptors
 * and (optionally) the console tty.  The VM stays on the global list;
 * use vm_remove() to also drop it from the running configuration.
 *
 * Parameters:
 *  vm: the VM to stop (NULL is allowed and ignored)
 *  keeptty: non-zero to keep the console tty open
 *  caller: name of the calling function, for debug logging
 */
void
vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 i, j;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s stopping vm %d%s",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");

	/* The VM is no longer running or waiting for shutdown. */
	vm->vm_state &= ~(VM_STATE_RUNNING | VM_STATE_SHUTDOWN);

	/* Decrement per-user resource counters and drop the user ref. */
	user_inc(&vm->vm_params.vmc_params, vm->vm_user, 0);
	user_put(vm->vm_user);

	/* Tear down the imsg channel to the VM process. */
	if (vm->vm_iev.ibuf.fd != -1) {
		event_del(&vm->vm_iev.ev);
		close(vm->vm_iev.ibuf.fd);
	}
	/* Close every disk image fd, including qcow2 base images. */
	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
			if (vm->vm_disks[i][j] != -1) {
				close(vm->vm_disks[i][j]);
				vm->vm_disks[i][j] = -1;
			}
		}
	}
	/* Close interface fds and free the associated name strings. */
	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
		if (vm->vm_ifs[i].vif_fd != -1) {
			close(vm->vm_ifs[i].vif_fd);
			vm->vm_ifs[i].vif_fd = -1;
		}
		free(vm->vm_ifs[i].vif_name);
		free(vm->vm_ifs[i].vif_switch);
		free(vm->vm_ifs[i].vif_group);
		vm->vm_ifs[i].vif_name = NULL;
		vm->vm_ifs[i].vif_switch = NULL;
		vm->vm_ifs[i].vif_group = NULL;
	}
	if (vm->vm_kernel != -1) {
		close(vm->vm_kernel);
		vm->vm_kernel = -1;
	}
	if (vm->vm_cdrom != -1) {
		close(vm->vm_cdrom);
		vm->vm_cdrom = -1;
	}
	if (!keeptty) {
		/* Release the console tty and forget the console owner. */
		vm_closetty(vm);
		vm->vm_uid = 0;
	}
}
1207
/*
 * vm_remove
 *
 * Remove a VM from the running configuration: unlink it from the
 * global VM list, release its resources via vm_stop() and free the
 * VM structure.
 *
 * Parameters:
 *  vm: the VM to remove (NULL is allowed and ignored)
 *  caller: name of the calling function, for debug logging
 */
void
vm_remove(struct vmd_vm *vm, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s removing vm %d from running config",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid);

	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);

	/*
	 * NOTE(review): user_put() is called here and again inside
	 * vm_stop(), dropping two references on vm->vm_user — confirm
	 * this matches how the references are acquired.
	 */
	user_put(vm->vm_user);
	vm_stop(vm, 0, caller);
	free(vm);
}
1226
1227int
1228vm_claimid(const char *name, int uid, uint32_t *id)
1229{
1230	struct name2id *n2i = NULL;
1231
1232	TAILQ_FOREACH(n2i, env->vmd_known, entry)
1233		if (strcmp(n2i->name, name) == 0 && n2i->uid == uid)
1234			goto out;
1235
1236	if (++env->vmd_nvm == 0) {
1237		log_warnx("too many vms");
1238		return -1;
1239	}
1240	if ((n2i = calloc(1, sizeof(struct name2id))) == NULL) {
1241		log_warnx("could not alloc vm name");
1242		return -1;
1243	}
1244	n2i->id = env->vmd_nvm;
1245	n2i->uid = uid;
1246	if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name)) {
1247		log_warnx("vm name too long");
1248		free(n2i);
1249		return -1;
1250	}
1251	TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry);
1252
1253out:
1254	*id = n2i->id;
1255	return 0;
1256}
1257
1258int
1259vm_register(struct privsep *ps, struct vmop_create_params *vmc,
1260    struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
1261{
1262	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
1263	struct vm_create_params	*vcp = &vmc->vmc_params;
1264	struct vmop_owner	*vmo = NULL;
1265	struct vmd_user		*usr = NULL;
1266	uint32_t		 nid, rng;
1267	unsigned int		 i, j;
1268	struct vmd_switch	*sw;
1269	char			*s;
1270	int			 ret = 0;
1271
1272	/* Check if this is an instance of another VM */
1273	if ((ret = vm_instance(ps, &vm_parent, vmc, uid)) != 0) {
1274		errno = ret; /* XXX might set invalid errno */
1275		return (-1);
1276	}
1277
1278	errno = 0;
1279	*ret_vm = NULL;
1280
1281	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1282	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1283		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
1284		    uid) != 0) {
1285			errno = EPERM;
1286			goto fail;
1287		}
1288		*ret_vm = vm;
1289		errno = EALREADY;
1290		goto fail;
1291	}
1292
1293	if (vm_parent != NULL)
1294		vmo = &vm_parent->vm_params.vmc_insowner;
1295
1296	/* non-root users can only start existing VMs or instances */
1297	if (vm_checkperm(NULL, vmo, uid) != 0) {
1298		log_warnx("permission denied");
1299		errno = EPERM;
1300		goto fail;
1301	}
1302	if (vmc->vmc_flags == 0) {
1303		log_warnx("invalid configuration, no devices");
1304		errno = VMD_DISK_MISSING;
1305		goto fail;
1306	}
1307	if (vcp->vcp_ncpus == 0)
1308		vcp->vcp_ncpus = 1;
1309	if (vcp->vcp_memranges[0].vmr_size == 0)
1310		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
1311	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
1312		log_warnx("invalid number of CPUs");
1313		goto fail;
1314	} else if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) {
1315		log_warnx("invalid number of disks");
1316		goto fail;
1317	} else if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) {
1318		log_warnx("invalid number of interfaces");
1319		goto fail;
1320	} else if (strlen(vcp->vcp_kernel) == 0 &&
1321	    vcp->vcp_ndisks == 0 && strlen(vcp->vcp_cdrom) == 0) {
1322		log_warnx("no kernel or disk/cdrom specified");
1323		goto fail;
1324	} else if (strlen(vcp->vcp_name) == 0) {
1325		log_warnx("invalid VM name");
1326		goto fail;
1327	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
1328	    *vcp->vcp_name == '_') {
1329		log_warnx("invalid VM name");
1330		goto fail;
1331	} else {
1332		for (s = vcp->vcp_name; *s != '\0'; ++s) {
1333			if (!(isalnum(*s) || *s == '.' || *s == '-' ||
1334			    *s == '_')) {
1335				log_warnx("invalid VM name");
1336				goto fail;
1337			}
1338		}
1339	}
1340
1341	/* track active users */
1342	if (uid != 0 && env->vmd_users != NULL &&
1343	    (usr = user_get(uid)) == NULL) {
1344		log_warnx("could not add user");
1345		goto fail;
1346	}
1347
1348	if ((vm = calloc(1, sizeof(*vm))) == NULL)
1349		goto fail;
1350
1351	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
1352	vmc = &vm->vm_params;
1353	vcp = &vmc->vmc_params;
1354	vm->vm_pid = -1;
1355	vm->vm_tty = -1;
1356	vm->vm_receive_fd = -1;
1357	vm->vm_state &= ~VM_STATE_PAUSED;
1358	vm->vm_user = usr;
1359
1360	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
1361		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
1362			vm->vm_disks[i][j] = -1;
1363	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
1364		vm->vm_ifs[i].vif_fd = -1;
1365	for (i = 0; i < vcp->vcp_nnics; i++) {
1366		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
1367			/* inherit per-interface flags from the switch */
1368			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
1369		}
1370
1371		/*
1372		 * If the MAC address is zero, always randomize it in vmd(8)
1373		 * because we cannot rely on the guest OS to do the right
1374		 * thing like OpenBSD does.  Based on ether_fakeaddr()
1375		 * from the kernel, incremented by one to differentiate
1376		 * the source.
1377		 */
1378		if (memcmp(zero_mac, &vcp->vcp_macs[i], ETHER_ADDR_LEN) == 0) {
1379			rng = arc4random();
1380			vcp->vcp_macs[i][0] = 0xfe;
1381			vcp->vcp_macs[i][1] = 0xe1;
1382			vcp->vcp_macs[i][2] = 0xba + 1;
1383			vcp->vcp_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
1384			vcp->vcp_macs[i][4] = rng;
1385			vcp->vcp_macs[i][5] = rng >> 8;
1386		}
1387	}
1388	vm->vm_kernel = -1;
1389	vm->vm_cdrom = -1;
1390	vm->vm_iev.ibuf.fd = -1;
1391
1392	/*
1393	 * Assign a new internal Id if not specified and we succeed in
1394	 * claiming a new Id.
1395	 */
1396	if (id != 0)
1397		vm->vm_vmid = id;
1398	else if (vm_claimid(vcp->vcp_name, uid, &nid) == -1)
1399		goto fail;
1400	else
1401		vm->vm_vmid = nid;
1402
1403	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
1404	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);
1405
1406	*ret_vm = vm;
1407	return (0);
1408 fail:
1409	if (errno == 0)
1410		errno = EINVAL;
1411	return (-1);
1412}
1413
1414int
1415vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1416    struct vmop_create_params *vmc, uid_t uid)
1417{
1418	char			*name;
1419	struct vm_create_params	*vcp = &vmc->vmc_params;
1420	struct vmop_create_params *vmcp;
1421	struct vm_create_params	*vcpp;
1422	struct vmd_vm		*vm = NULL;
1423	unsigned int		 i, j;
1424	uint32_t		 id;
1425
1426	/* return without error if the parent is NULL (nothing to inherit) */
1427	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1428	    vmc->vmc_instance[0] == '\0')
1429		return (0);
1430
1431	if ((*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) {
1432		return (VMD_PARENT_INVALID);
1433	}
1434
1435	vmcp = &(*vm_parent)->vm_params;
1436	vcpp = &vmcp->vmc_params;
1437
1438	/* Are we allowed to create an instance from this VM? */
1439	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1440		log_warnx("vm \"%s\" no permission to create vm instance",
1441		    vcpp->vcp_name);
1442		return (ENAMETOOLONG);
1443	}
1444
1445	id = vcp->vcp_id;
1446	name = vcp->vcp_name;
1447
1448	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1449	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1450		return (EPROCLIM);
1451	}
1452
1453	/* CPU */
1454	if (vcp->vcp_ncpus == 0)
1455		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1456	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1457	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1458		log_warnx("vm \"%s\" no permission to set cpus", name);
1459		return (EPERM);
1460	}
1461
1462	/* memory */
1463	if (vcp->vcp_memranges[0].vmr_size == 0)
1464		vcp->vcp_memranges[0].vmr_size =
1465		    vcpp->vcp_memranges[0].vmr_size;
1466	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1467	    vcp->vcp_memranges[0].vmr_size !=
1468	    vcpp->vcp_memranges[0].vmr_size) {
1469		log_warnx("vm \"%s\" no permission to set memory", name);
1470		return (EPERM);
1471	}
1472
1473	/* disks cannot be inherited */
1474	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1475	    vcp->vcp_ndisks) {
1476		log_warnx("vm \"%s\" no permission to set disks", name);
1477		return (EPERM);
1478	}
1479	for (i = 0; i < vcp->vcp_ndisks; i++) {
1480		/* Check if this disk is already used in the parent */
1481		for (j = 0; j < vcpp->vcp_ndisks; j++) {
1482			if (strcmp(vcp->vcp_disks[i],
1483			    vcpp->vcp_disks[j]) == 0) {
1484				log_warnx("vm \"%s\" disk %s cannot be reused",
1485				    name, vcp->vcp_disks[i]);
1486				return (EBUSY);
1487			}
1488		}
1489		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1490	}
1491
1492	/* interfaces */
1493	if (vcp->vcp_nnics > 0 &&
1494	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1495	    vcp->vcp_nnics != vcpp->vcp_nnics) {
1496		log_warnx("vm \"%s\" no permission to set interfaces", name);
1497		return (EPERM);
1498	}
1499	for (i = 0; i < vcpp->vcp_nnics; i++) {
1500		/* Interface got overwritten */
1501		if (i < vcp->vcp_nnics)
1502			continue;
1503
1504		/* Copy interface from parent */
1505		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1506		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1507		    sizeof(vmc->vmc_ifnames[i]));
1508		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1509		    sizeof(vmc->vmc_ifswitch[i]));
1510		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1511		    sizeof(vmc->vmc_ifgroup[i]));
1512		memcpy(vcp->vcp_macs[i], vcpp->vcp_macs[i],
1513		    sizeof(vcp->vcp_macs[i]));
1514		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1515		vcp->vcp_nnics++;
1516	}
1517	for (i = 0; i < vcp->vcp_nnics; i++) {
1518		for (j = 0; j < vcpp->vcp_nnics; j++) {
1519			if (memcmp(zero_mac, vcp->vcp_macs[i],
1520			    sizeof(vcp->vcp_macs[i])) != 0 &&
1521			    memcmp(vcpp->vcp_macs[i], vcp->vcp_macs[i],
1522			    sizeof(vcp->vcp_macs[i])) != 0) {
1523				log_warnx("vm \"%s\" lladdr cannot be reused",
1524				    name);
1525				return (EBUSY);
1526			}
1527			if (strlen(vmc->vmc_ifnames[i]) &&
1528			    strcmp(vmc->vmc_ifnames[i],
1529			    vmcp->vmc_ifnames[j]) == 0) {
1530				log_warnx("vm \"%s\" %s cannot be reused",
1531				    vmc->vmc_ifnames[i], name);
1532				return (EBUSY);
1533			}
1534		}
1535	}
1536
1537	/* kernel */
1538	if (strlen(vcp->vcp_kernel) > 0) {
1539		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1540			log_warnx("vm \"%s\" no permission to set boot image",
1541			    name);
1542			return (EPERM);
1543		}
1544		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1545	} else if (strlcpy(vcp->vcp_kernel, vcpp->vcp_kernel,
1546	    sizeof(vcp->vcp_kernel)) >= sizeof(vcp->vcp_kernel)) {
1547		log_warnx("vm \"%s\" kernel name too long", name);
1548		return (EINVAL);
1549	}
1550
1551	/* cdrom */
1552	if (strlen(vcp->vcp_cdrom) > 0) {
1553		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1554			log_warnx("vm \"%s\" no permission to set cdrom", name);
1555			return (EPERM);
1556		}
1557		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1558	} else if (strlcpy(vcp->vcp_cdrom, vcpp->vcp_cdrom,
1559	    sizeof(vcp->vcp_cdrom)) >= sizeof(vcp->vcp_cdrom)) {
1560		log_warnx("vm \"%s\" cdrom name too long", name);
1561		return (EINVAL);
1562	}
1563
1564	/* user */
1565	if (vmc->vmc_owner.uid == 0)
1566		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1567	else if (vmc->vmc_owner.uid != uid &&
1568	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1569		log_warnx("vm \"%s\" user mismatch", name);
1570		return (EPERM);
1571	}
1572
1573	/* group */
1574	if (vmc->vmc_owner.gid == 0)
1575		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1576	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1577		log_warnx("vm \"%s\" group mismatch", name);
1578		return (EPERM);
1579	}
1580
1581	/* child instances */
1582	if (vmc->vmc_insflags) {
1583		log_warnx("vm \"%s\" cannot change instance permissions", name);
1584		return (EPERM);
1585	}
1586	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1587		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
1588		vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid;
1589		vmc->vmc_insflags = vmcp->vmc_insflags;
1590	} else {
1591		vmc->vmc_insowner.gid = 0;
1592		vmc->vmc_insowner.uid = 0;
1593		vmc->vmc_insflags = 0;
1594	}
1595
1596	/* finished, remove instance flags */
1597	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1598
1599	return (0);
1600}
1601
1602/*
1603 * vm_checkperm
1604 *
1605 * Checks if the user represented by the 'uid' parameter is allowed to
1606 * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1607 * console.)
1608 *
1609 * Parameters:
1610 *  vm: the VM whose permission is to be checked
1611 *  vmo: the required uid/gid to be checked
1612 *  uid: the user ID of the user making the request
1613 *
1614 * Return values:
1615 *   0: the permission should be granted
1616 *  -1: the permission check failed (also returned if vm == null)
1617 */
1618int
1619vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1620{
1621	struct group	*gr;
1622	struct passwd	*pw;
1623	char		**grmem;
1624
1625	/* root has no restrictions */
1626	if (uid == 0)
1627		return (0);
1628
1629	if (vmo == NULL)
1630		return (-1);
1631
1632	/* check user */
1633	if (vm == NULL) {
1634		if  (vmo->uid == uid)
1635			return (0);
1636	} else {
1637		/*
1638		 * check user of running vm (the owner of a running vm can
1639		 * be different to (or more specific than) the configured owner.
1640		 */
1641		if (((vm->vm_state & VM_STATE_RUNNING) && vm->vm_uid == uid) ||
1642		    (!(vm->vm_state & VM_STATE_RUNNING) && vmo->uid == uid))
1643			return (0);
1644	}
1645
1646	/* check groups */
1647	if (vmo->gid != -1) {
1648		if ((pw = getpwuid(uid)) == NULL)
1649			return (-1);
1650		if (pw->pw_gid == vmo->gid)
1651			return (0);
1652		if ((gr = getgrgid(vmo->gid)) != NULL) {
1653			for (grmem = gr->gr_mem; *grmem; grmem++)
1654				if (strcmp(*grmem, pw->pw_name) == 0)
1655					return (0);
1656		}
1657	}
1658
1659	return (-1);
1660}
1661
1662/*
1663 * vm_checkinsflag
1664 *
1665 * Checks wheter the non-root user is allowed to set an instance option.
1666 *
1667 * Parameters:
1668 *  vmc: the VM create parameters
1669 *  flag: the flag to be checked
1670 *  uid: the user ID of the user making the request
1671 *
1672 * Return values:
1673 *   0: the permission should be granted
1674 *  -1: the permission check failed (also returned if vm == null)
1675 */
1676int
1677vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1678{
1679	/* root has no restrictions */
1680	if (uid == 0)
1681		return (0);
1682
1683	if ((vmc->vmc_insflags & flag) == 0)
1684		return (-1);
1685
1686	return (0);
1687}
1688
1689/*
1690 * vm_checkaccess
1691 *
1692 * Checks if the user represented by the 'uid' parameter is allowed to
1693 * access the file described by the 'path' parameter.
1694 *
1695 * Parameters:
1696 *  fd: the file descriptor of the opened file
1697 *  uflag: check if the userid has access to the file
1698 *  uid: the user ID of the user making the request
1699 *  amode: the access flags of R_OK and W_OK
1700 *
1701 * Return values:
1702 *   0: the permission should be granted
1703 *  -1: the permission check failed
1704 */
1705int
1706vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
1707{
1708	struct group	*gr;
1709	struct passwd	*pw;
1710	char		**grmem;
1711	struct stat	 st;
1712	mode_t		 mode;
1713
1714	if (fd == -1)
1715		return (-1);
1716
1717	/*
1718	 * File has to be accessible and a regular file
1719	 */
1720	if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode))
1721		return (-1);
1722
1723	/* root has no restrictions */
1724	if (uid == 0 || uflag == 0)
1725		return (0);
1726
1727	/* check other */
1728	mode = amode & W_OK ? S_IWOTH : 0;
1729	mode |= amode & R_OK ? S_IROTH : 0;
1730	if ((st.st_mode & mode) == mode)
1731		return (0);
1732
1733	/* check user */
1734	mode = amode & W_OK ? S_IWUSR : 0;
1735	mode |= amode & R_OK ? S_IRUSR : 0;
1736	if (uid == st.st_uid && (st.st_mode & mode) == mode)
1737		return (0);
1738
1739	/* check groups */
1740	mode = amode & W_OK ? S_IWGRP : 0;
1741	mode |= amode & R_OK ? S_IRGRP : 0;
1742	if ((st.st_mode & mode) != mode)
1743		return (-1);
1744	if ((pw = getpwuid(uid)) == NULL)
1745		return (-1);
1746	if (pw->pw_gid == st.st_gid)
1747		return (0);
1748	if ((gr = getgrgid(st.st_gid)) != NULL) {
1749		for (grmem = gr->gr_mem; *grmem; grmem++)
1750			if (strcmp(*grmem, pw->pw_name) == 0)
1751				return (0);
1752	}
1753
1754	return (-1);
1755}
1756
/*
 * vm_opentty
 *
 * Open a console tty for the VM via the pre-opened PTM device,
 * enable user ioctl mode on it, and set the slave tty's owner and
 * mode according to the VM's configured owner.
 *
 * Parameters:
 *  vm: the VM that gets the console tty
 *
 * Return values:
 *   0: success, vm_tty and vm_ttyname are set
 *  -1: failure, any partial state is released via vm_closetty()
 */
int
vm_opentty(struct vmd_vm *vm)
{
	struct ptmget		 ptm;
	struct stat		 st;
	struct group		*gr;
	uid_t			 uid;
	gid_t			 gid;
	mode_t			 mode;
	int			 on;

	/*
	 * Open tty with pre-opened PTM fd
	 */
	if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1))
		return (-1);

	/*
	 * We use user ioctl(2) mode to pass break commands.
	 */
	on = 1;
	if (ioctl(ptm.cfd, TIOCUCNTL, &on) == -1)
		fatal("could not enable user ioctl mode");

	/* Keep the master side; the slave is reopened by the console user. */
	vm->vm_tty = ptm.cfd;
	close(ptm.sfd);
	if ((vm->vm_ttyname = strdup(ptm.sn)) == NULL)
		goto fail;

	uid = vm->vm_uid;
	gid = vm->vm_params.vmc_owner.gid;

	/*
	 * Pick group and mode: a configured owner group gets rw access,
	 * otherwise fall back to the "tty" group (write-only for group),
	 * or to user-only access if that group does not exist.
	 */
	if (vm->vm_params.vmc_owner.gid != -1) {
		mode = 0660;
	} else if ((gr = getgrnam("tty")) != NULL) {
		gid = gr->gr_gid;
		mode = 0620;
	} else {
		mode = 0600;
		gid = 0;
	}

	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
	    __func__, vm->vm_params.vmc_params.vcp_name,
	    vm->vm_ttyname, uid, gid, mode);

	/*
	 * Change ownership and mode of the tty as required.
	 * Loosely based on the implementation of sshpty.c
	 */
	if (stat(vm->vm_ttyname, &st) == -1)
		goto fail;

	if (st.st_uid != uid || st.st_gid != gid) {
		if (chown(vm->vm_ttyname, uid, gid) == -1) {
			log_warn("chown %s %d %d failed, uid %d",
			    vm->vm_ttyname, uid, gid, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
		if (chmod(vm->vm_ttyname, mode) == -1) {
			log_warn("chmod %s %o failed, uid %d",
			    vm->vm_ttyname, mode, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	return (0);
 fail:
	vm_closetty(vm);
	return (-1);
}
1839
1840void
1841vm_closetty(struct vmd_vm *vm)
1842{
1843	if (vm->vm_tty != -1) {
1844		/* Release and close the tty */
1845		if (fchown(vm->vm_tty, 0, 0) == -1)
1846			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1847		if (fchmod(vm->vm_tty, 0666) == -1)
1848			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1849		close(vm->vm_tty);
1850		vm->vm_tty = -1;
1851	}
1852	free(vm->vm_ttyname);
1853	vm->vm_ttyname = NULL;
1854}
1855
1856void
1857switch_remove(struct vmd_switch *vsw)
1858{
1859	if (vsw == NULL)
1860		return;
1861
1862	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1863
1864	free(vsw->sw_group);
1865	free(vsw->sw_name);
1866	free(vsw);
1867}
1868
1869struct vmd_switch *
1870switch_getbyname(const char *name)
1871{
1872	struct vmd_switch	*vsw;
1873
1874	if (name == NULL)
1875		return (NULL);
1876	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1877		if (strcmp(vsw->sw_name, name) == 0)
1878			return (vsw);
1879	}
1880
1881	return (NULL);
1882}
1883
1884struct vmd_user *
1885user_get(uid_t uid)
1886{
1887	struct vmd_user		*usr;
1888
1889	if (uid == 0)
1890		return (NULL);
1891
1892	/* first try to find an existing user */
1893	TAILQ_FOREACH(usr, env->vmd_users, usr_entry) {
1894		if (usr->usr_id.uid == uid)
1895			goto done;
1896	}
1897
1898	if ((usr = calloc(1, sizeof(*usr))) == NULL) {
1899		log_warn("could not allocate user");
1900		return (NULL);
1901	}
1902
1903	usr->usr_id.uid = uid;
1904	usr->usr_id.gid = -1;
1905	TAILQ_INSERT_TAIL(env->vmd_users, usr, usr_entry);
1906
1907 done:
1908	DPRINTF("%s: uid %d #%d +",
1909	    __func__, usr->usr_id.uid, usr->usr_refcnt + 1);
1910	usr->usr_refcnt++;
1911
1912	return (usr);
1913}
1914
1915void
1916user_put(struct vmd_user *usr)
1917{
1918	if (usr == NULL)
1919		return;
1920
1921	DPRINTF("%s: uid %d #%d -",
1922	    __func__, usr->usr_id.uid, usr->usr_refcnt - 1);
1923
1924	if (--usr->usr_refcnt > 0)
1925		return;
1926
1927	TAILQ_REMOVE(env->vmd_users, usr, usr_entry);
1928	free(usr);
1929}
1930
/*
 * user_inc
 *
 * Add or subtract a VM's resources (cpus, memory, interfaces) from
 * the per-user accounting counters.
 *
 * Parameters:
 *  vcp: the VM create parameters supplying the resource amounts
 *  usr: the user entry (NULL is allowed and ignored)
 *  inc: non-zero to add the resources, zero to subtract them
 */
void
user_inc(struct vm_create_params *vcp, struct vmd_user *usr, int inc)
{
	char	 mem[FMT_SCALED_STRSIZE];
	int	 sign;

	if (usr == NULL)
		return;

	/* increment or decrement counters */
	sign = inc ? 1 : -1;

	usr->usr_maxcpu += vcp->vcp_ncpus * sign;
	usr->usr_maxmem += vcp->vcp_memranges[0].vmr_size * sign;
	usr->usr_maxifs += vcp->vcp_nnics * sign;

	if (log_getverbose() > 1) {
		(void)fmt_scaled(usr->usr_maxmem * 1024 * 1024, mem);
		log_debug("%s: %c uid %d ref %d cpu %llu mem %s ifs %llu",
		    __func__, sign == 1 ? '+' : '-',
		    usr->usr_id.uid, usr->usr_refcnt,
		    usr->usr_maxcpu, mem, usr->usr_maxifs);
	}
}
1954
1955int
1956user_checklimit(struct vmd_user *usr, struct vm_create_params *vcp)
1957{
1958	const char	*limit = "";
1959
1960	/* XXX make the limits configurable */
1961	if (usr->usr_maxcpu > VM_DEFAULT_USER_MAXCPU) {
1962		limit = "cpu ";
1963		goto fail;
1964	}
1965	if (usr->usr_maxmem > VM_DEFAULT_USER_MAXMEM) {
1966		limit = "memory ";
1967		goto fail;
1968	}
1969	if (usr->usr_maxifs > VM_DEFAULT_USER_MAXIFS) {
1970		limit = "interface ";
1971		goto fail;
1972	}
1973
1974	return (0);
1975
1976 fail:
1977	log_warnx("%s: user %d %slimit reached", vcp->vcp_name,
1978	    usr->usr_id.uid, limit);
1979	return (-1);
1980}
1981
1982char *
1983get_string(uint8_t *ptr, size_t len)
1984{
1985	size_t	 i;
1986
1987	for (i = 0; i < len; i++)
1988		if (!isprint(ptr[i]))
1989			break;
1990
1991	return strndup(ptr, i);
1992}
1993
/*
 * prefixlen2mask
 *
 * Convert an IPv4 prefix length into a network-byte-order netmask.
 * Prefix lengths above 32 are clamped to 32.
 *
 * Parameters:
 *  prefixlen: the prefix length (0-32)
 *
 * Return values:
 *  The netmask in network byte order; 0 for a zero prefix length.
 */
uint32_t
prefixlen2mask(uint8_t prefixlen)
{
	uint32_t	 hostbits;

	if (prefixlen == 0)
		return (0);
	if (prefixlen > 32)
		prefixlen = 32;

	/* prefixlen >= 1 here, so the shift count is at most 31 */
	hostbits = 32 - prefixlen;
	return (htonl(0xffffffff << hostbits));
}
2005
/*
 * prefixlen2mask6
 *
 * Convert an IPv6 prefix length into a netmask.  Prefix lengths above
 * 128 are clamped to 128.
 *
 * Parameters:
 *  prefixlen: the prefix length (0-128)
 *  mask: returns the resulting netmask
 */
void
prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
{
	struct in6_addr	 s6;
	int		 nbytes, nbits, k;

	if (prefixlen > 128)
		prefixlen = 128;

	memset(&s6, 0, sizeof(s6));

	/* whole 0xff bytes first, then the remaining high bits */
	nbytes = prefixlen / 8;
	nbits = prefixlen % 8;
	for (k = 0; k < nbytes; k++)
		s6.s6_addr[k] = 0xff;
	if (nbits != 0)
		s6.s6_addr[nbytes] = 0xff00 >> nbits;

	memcpy(mask, &s6, sizeof(s6));
}
2024
2025void
2026getmonotime(struct timeval *tv)
2027{
2028	struct timespec	 ts;
2029
2030	if (clock_gettime(CLOCK_MONOTONIC, &ts))
2031		fatal("clock_gettime");
2032
2033	TIMESPEC_TO_TIMEVAL(tv, &ts);
2034}
2035