1/*	$OpenBSD: vmd.c,v 1.106 2018/11/26 05:44:46 ori Exp $	*/
2
3/*
4 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/param.h>	/* nitems */
20#include <sys/queue.h>
21#include <sys/wait.h>
22#include <sys/cdefs.h>
23#include <sys/stat.h>
24#include <sys/tty.h>
25#include <sys/ttycom.h>
26#include <sys/ioctl.h>
27
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31#include <termios.h>
32#include <errno.h>
33#include <event.h>
34#include <fcntl.h>
35#include <pwd.h>
36#include <signal.h>
37#include <syslog.h>
38#include <unistd.h>
39#include <util.h>
40#include <ctype.h>
42#include <grp.h>
43
44#include <machine/specialreg.h>
45#include <machine/vmmvar.h>
46
47#include "proc.h"
48#include "atomicio.h"
49#include "vmd.h"
50
51__dead void usage(void);
52
53int	 main(int, char **);
54int	 vmd_configure(void);
55void	 vmd_sighdlr(int sig, short event, void *arg);
56void	 vmd_shutdown(void);
57int	 vmd_control_run(void);
58int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
59int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
60int	 vmd_check_vmh(struct vm_dump_header *);
61
62int	 vm_instance(struct privsep *, struct vmd_vm **,
63	    struct vmop_create_params *, uid_t);
64int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
65uint32_t vm_claimid(const char *, int);
66
67struct vmd	*env;
68
69static struct privsep_proc procs[] = {
70	/* Keep "priv" on top as procs[0] */
71	{ "priv",	PROC_PRIV,	NULL, priv },
72	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
73	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm, vmm_shutdown },
74};
75
76/* For the privileged process */
77static struct privsep_proc *proc_priv = &procs[0];
78static struct passwd proc_privpw;
79static const uint8_t zero_mac[ETHER_ADDR_LEN];
80
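/*
 * vmd_dispatch_control
 *
 * Handle imsg requests received from the control process on behalf of
 * vmctl(8) clients: start, terminate, pause, unpause, send and receive
 * VMs, as well as configuration (re)loads, resets and verbosity changes.
 */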
81int
82vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
83{
84	struct privsep			*ps = p->p_ps;
85	int				 res = 0, ret = 0, cmd = 0, verbose;
86	unsigned int			 v = 0, flags;
87	struct vmop_create_params	 vmc;
88	struct vmop_id			 vid;
89	struct vmop_result		 vmr;
90	struct vm_dump_header		 vmh;
91	struct vmd_vm			*vm = NULL;
92	char				*str = NULL;
93	uint32_t			 id = 0;
94	struct control_sock		*rcs;
95
96	switch (imsg->hdr.type) {
97	case IMSG_VMDOP_START_VM_REQUEST:
98		IMSG_SIZE_CHECK(imsg, &vmc);
99		memcpy(&vmc, imsg->data, sizeof(vmc));
100		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
101		if (vmc.vmc_flags == 0) {
102			/* start an existing VM with pre-configured options */
103			if (!(ret == -1 && errno == EALREADY &&
104			    vm->vm_running == 0)) {
105				res = errno;
106				cmd = IMSG_VMDOP_START_VM_RESPONSE;
107			}
108		} else if (ret != 0) {
109			res = errno;
110			cmd = IMSG_VMDOP_START_VM_RESPONSE;
111		}
112		if (res == 0 &&
113		    config_setvm(ps, vm,
114		    imsg->hdr.peerid, vm->vm_params.vmc_owner.uid) == -1) {
115			res = errno;
116			cmd = IMSG_VMDOP_START_VM_RESPONSE;
117		}
118		break;
119	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
120		IMSG_SIZE_CHECK(imsg, &vid);
121		memcpy(&vid, imsg->data, sizeof(vid));
122		flags = vid.vid_flags;
123
124		if ((id = vid.vid_id) == 0) {
125			/* Lookup vm (id) by name */
126			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
127				res = ENOENT;
128				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
129				break;
130			} else if (vm->vm_shutdown &&
131			    (flags & VMOP_FORCE) == 0) {
132				res = EALREADY;
133				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
134				break;
135			} else if (vm->vm_running == 0) {
136				res = EINVAL;
137				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
138				break;
139			}
140			id = vm->vm_vmid;
141		} else if ((vm = vm_getbyvmid(id)) == NULL) {
142			res = ENOENT;
143			cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
144			break;
145		}
146		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
147		    vid.vid_uid) != 0) {
148			res = EPERM;
149			cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
150			break;
151		}
152
153		memset(&vid, 0, sizeof(vid));
154		vid.vid_id = id;
155		vid.vid_flags = flags;
156		if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
157		    imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
158			return (-1);
159		break;
160	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
161		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
162		break;
163	case IMSG_VMDOP_LOAD:
164		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
165		str = get_string((uint8_t *)imsg->data,
166		    IMSG_DATA_SIZE(imsg));
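		/* FALLTHROUGH */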
167	case IMSG_VMDOP_RELOAD:
168		if (vmd_reload(0, str) == -1)
169			cmd = IMSG_CTL_FAIL;
170		else
171			cmd = IMSG_CTL_OK;
172		free(str);
173		break;
174	case IMSG_CTL_RESET:
175		IMSG_SIZE_CHECK(imsg, &v);
176		memcpy(&v, imsg->data, sizeof(v));
177		if (vmd_reload(v, NULL) == -1)
178			cmd = IMSG_CTL_FAIL;
179		else
180			cmd = IMSG_CTL_OK;
181		break;
182	case IMSG_CTL_VERBOSE:
183		IMSG_SIZE_CHECK(imsg, &verbose);
184		memcpy(&verbose, imsg->data, sizeof(verbose));
185		log_setverbose(verbose);
186
187		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
188		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
189		cmd = IMSG_CTL_OK;
190		break;
191	case IMSG_VMDOP_PAUSE_VM:
192	case IMSG_VMDOP_UNPAUSE_VM:
193		IMSG_SIZE_CHECK(imsg, &vid);
194		memcpy(&vid, imsg->data, sizeof(vid));
195		if (vid.vid_id == 0) {
196			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
197				res = ENOENT;
198				cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
199				break;
200			} else {
201				vid.vid_id = vm->vm_vmid;
202			}
203		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
204			res = ENOENT;
205			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
206			break;
207		}
208		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
209		    vid.vid_uid) != 0) {
210			res = EPERM;
211			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
212			break;
213		}
214		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
215		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
216		break;
217	case IMSG_VMDOP_SEND_VM_REQUEST:
218		IMSG_SIZE_CHECK(imsg, &vid);
219		memcpy(&vid, imsg->data, sizeof(vid));
220		id = vid.vid_id;
221		if (vid.vid_id == 0) {
222			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
223				res = ENOENT;
224				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
225				close(imsg->fd);
226				break;
227			} else {
228				vid.vid_id = vm->vm_vmid;
229			}
230		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
231			res = ENOENT;
232			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
233			close(imsg->fd);
234			break;
235		}
237		vmr.vmr_id = vid.vid_id;
238		log_debug("%s: sending fd to vmm", __func__);
239		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
240		    imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid));
241		break;
242	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
243		IMSG_SIZE_CHECK(imsg, &vid);
244		memcpy(&vid, imsg->data, sizeof(vid));
245		if (imsg->fd == -1) {
246			log_warnx("%s: invalid fd", __func__);
247			return (-1);
248		}
249		if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) !=
250		    sizeof(vmh)) {
251			log_warnx("%s: error reading vmh from received vm",
252			    __func__);
253			res = EIO;
254			close(imsg->fd);
255			cmd = IMSG_VMDOP_START_VM_RESPONSE;
256			break;
257		}
258
259		if (vmd_check_vmh(&vmh)) {
260			res = ENOENT;
261			close(imsg->fd);
262			cmd = IMSG_VMDOP_START_VM_RESPONSE;
263			break;
264		}
265		if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) !=
266		    sizeof(vmc)) {
267			log_warnx("%s: error reading vmc from received vm",
268			    __func__);
269			res = EIO;
270			close(imsg->fd);
271			cmd = IMSG_VMDOP_START_VM_RESPONSE;
272			break;
273		}
274		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
275		    sizeof(vmc.vmc_params.vcp_name));
276		vmc.vmc_params.vcp_id = 0;
277
278		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
279		if (ret != 0) {
280			res = errno;
281			cmd = IMSG_VMDOP_START_VM_RESPONSE;
282			close(imsg->fd);
283		} else {
284			vm->vm_received = 1;
285			config_setvm(ps, vm, imsg->hdr.peerid,
286			    vmc.vmc_owner.uid);
287			log_debug("%s: sending fd to vmm", __func__);
288			proc_compose_imsg(ps, PROC_VMM, -1,
289			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd,
290			    NULL, 0);
291		}
292		break;
293	case IMSG_VMDOP_DONE:
294		control_reset(&ps->ps_csock);
295		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
296			control_reset(rcs);
297		cmd = 0;
298		break;
299	default:
300		return (-1);
301	}
302
303	switch (cmd) {
304	case 0:
305		break;
306	case IMSG_VMDOP_START_VM_RESPONSE:
307	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
308		memset(&vmr, 0, sizeof(vmr));
309		vmr.vmr_result = res;
310		vmr.vmr_id = id;
311		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
312		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
313			return (-1);
314		break;
315	default:
316		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
317		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
318			return (-1);
319		break;
320	}
321
322	return (0);
323}
324
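/*
 * vmd_dispatch_vmm
 *
 * Handle responses and events from the vmm process, update the parent's
 * view of the VM state and forward the results to the control process.
 */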
325int
326vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
327{
328	struct vmop_result	 vmr;
329	struct privsep		*ps = p->p_ps;
330	int			 res = 0;
331	struct vmd_vm		*vm;
332	struct vm_create_params	*vcp;
333	struct vmop_info_result	 vir;
334
335	switch (imsg->hdr.type) {
336	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
337		IMSG_SIZE_CHECK(imsg, &vmr);
338		memcpy(&vmr, imsg->data, sizeof(vmr));
339		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
340			break;
341		proc_compose_imsg(ps, PROC_CONTROL, -1,
342		    imsg->hdr.type, imsg->hdr.peerid, -1,
343		    &vmr, sizeof(vmr));
344		log_info("%s: paused vm %d successfully",
345		    vm->vm_params.vmc_params.vcp_name,
346		    vm->vm_vmid);
347		break;
348	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
349		IMSG_SIZE_CHECK(imsg, &vmr);
350		memcpy(&vmr, imsg->data, sizeof(vmr));
351		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
352			break;
353		proc_compose_imsg(ps, PROC_CONTROL, -1,
354		    imsg->hdr.type, imsg->hdr.peerid, -1,
355		    &vmr, sizeof(vmr));
356		log_info("%s: unpaused vm %d successfully.",
357		    vm->vm_params.vmc_params.vcp_name,
358		    vm->vm_vmid);
359		break;
360	case IMSG_VMDOP_START_VM_RESPONSE:
361		IMSG_SIZE_CHECK(imsg, &vmr);
362		memcpy(&vmr, imsg->data, sizeof(vmr));
363		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
364			break;
365		vm->vm_pid = vmr.vmr_pid;
366		vcp = &vm->vm_params.vmc_params;
367		vcp->vcp_id = vmr.vmr_id;
368
369		/*
370		 * If the peerid is not -1, forward the response back to the
371		 * control socket.  If it is -1, the request originated
372		 * from the parent, not the control socket.
373		 */
374		if (vm->vm_peerid != (uint32_t)-1) {
375			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
376			    sizeof(vmr.vmr_ttyname));
377			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
378			    imsg->hdr.type, vm->vm_peerid, -1,
379			    &vmr, sizeof(vmr)) == -1) {
380				errno = vmr.vmr_result;
381				log_warn("%s: failed to forward vm result",
382				    vcp->vcp_name);
383				vm_remove(vm, __func__);
384				return (-1);
385			}
386		}
387
388		if (vmr.vmr_result) {
389			errno = vmr.vmr_result;
390			log_warn("%s: failed to start vm", vcp->vcp_name);
391			vm_remove(vm, __func__);
392			break;
393		}
394
395		/* Now configure all the interfaces */
396		if (vm_priv_ifconfig(ps, vm) == -1) {
397			log_warn("%s: failed to configure vm", vcp->vcp_name);
398			vm_remove(vm, __func__);
399			break;
400		}
401
402		log_info("%s: started vm %d successfully, tty %s",
403		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
404		break;
405	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
406		IMSG_SIZE_CHECK(imsg, &vmr);
407		memcpy(&vmr, imsg->data, sizeof(vmr));
408		DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
409		    __func__, vmr.vmr_id);
410		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
411		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
412			break;
413		if (vmr.vmr_result == 0) {
414			/* Mark VM as shutting down */
415			vm->vm_shutdown = 1;
416		}
417		break;
418	case IMSG_VMDOP_SEND_VM_RESPONSE:
419		IMSG_SIZE_CHECK(imsg, &vmr);
420		memcpy(&vmr, imsg->data, sizeof(vmr));
421		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
422			break;
423		if (!vmr.vmr_result) {
424			log_info("%s: sent vm %d successfully.",
425			    vm->vm_params.vmc_params.vcp_name,
426			    vm->vm_vmid);
427			if (vm->vm_from_config)
428				vm_stop(vm, 0, __func__);
429			else
430				vm_remove(vm, __func__);
431		}
432
433		/* Send a response if a control client is waiting for it */
434		if (imsg->hdr.peerid != (uint32_t)-1) {
435			/* the error is meaningless for deferred responses */
436			vmr.vmr_result = 0;
437
438			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
439			    IMSG_VMDOP_SEND_VM_RESPONSE,
440			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
441				return (-1);
442		}
443		break;
444	case IMSG_VMDOP_TERMINATE_VM_EVENT:
445		IMSG_SIZE_CHECK(imsg, &vmr);
446		memcpy(&vmr, imsg->data, sizeof(vmr));
447		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
448		    __func__, vmr.vmr_id, vmr.vmr_result);
449		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
450			log_debug("%s: vm %d is no longer available",
451			    __func__, vmr.vmr_id);
452			break;
453		}
454		if (vmr.vmr_result != EAGAIN) {
455			if (vm->vm_from_config)
456				vm_stop(vm, 0, __func__);
457			else
458				vm_remove(vm, __func__);
459		} else {
460			/* Stop VM instance but keep the tty open */
461			vm_stop(vm, 1, __func__);
462			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
463		}
464
465		/* Send a response if a control client is waiting for it */
466		if (imsg->hdr.peerid != (uint32_t)-1) {
467			/* the error is meaningless for deferred responses */
468			vmr.vmr_result = 0;
469
470			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
471			    IMSG_VMDOP_TERMINATE_VM_RESPONSE,
472			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
473				return (-1);
474		}
475		break;
476	case IMSG_VMDOP_GET_INFO_VM_DATA:
477		IMSG_SIZE_CHECK(imsg, &vir);
478		memcpy(&vir, imsg->data, sizeof(vir));
479		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
480			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
481			if (vm->vm_ttyname != NULL)
482				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
483				    sizeof(vir.vir_ttyname));
484			if (vm->vm_shutdown) {
485				/* XXX there might be a nicer way */
486				(void)strlcat(vir.vir_info.vir_name,
487				    " - stopping",
488				    sizeof(vir.vir_info.vir_name));
489			}
490			/* get the user id who started the vm */
491			vir.vir_uid = vm->vm_uid;
492			vir.vir_gid = vm->vm_params.vmc_owner.gid;
493		}
494		if (proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type,
495		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
496			log_debug("%s: GET_INFO_VM failed for vm %d, removing",
497			    __func__, vm->vm_vmid);
498			vm_remove(vm, __func__);
499			return (-1);
500		}
501		break;
502	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
503		/*
504		 * PROC_VMM has responded with the *running* VMs, now we
505		 * append the others. These use the special value 0 for their
506		 * kernel id to indicate that they are not running.
507		 */
508		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
509			if (!vm->vm_running) {
510				memset(&vir, 0, sizeof(vir));
511				vir.vir_info.vir_id = vm->vm_vmid;
512				strlcpy(vir.vir_info.vir_name,
513				    vm->vm_params.vmc_params.vcp_name,
514				    VMM_MAX_NAME_LEN);
515				vir.vir_info.vir_memory_size =
516				    vm->vm_params.vmc_params.
517				    vcp_memranges[0].vmr_size;
518				vir.vir_info.vir_ncpus =
519				    vm->vm_params.vmc_params.vcp_ncpus;
520				/* get the configured user id for this vm */
521				vir.vir_uid = vm->vm_params.vmc_owner.uid;
522				vir.vir_gid = vm->vm_params.vmc_owner.gid;
523				if (proc_compose_imsg(ps, PROC_CONTROL, -1,
524				    IMSG_VMDOP_GET_INFO_VM_DATA,
525				    imsg->hdr.peerid, -1, &vir,
526				    sizeof(vir)) == -1) {
527					log_debug("%s: GET_INFO_VM_END failed",
528					    __func__);
529					vm_remove(vm, __func__);
530					return (-1);
531				}
532			}
533		}
534		IMSG_SIZE_CHECK(imsg, &res);
535		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
536		break;
537	default:
538		return (-1);
539	}
540
541	return (0);
542}
543
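/*
 * vmd_check_vmh
 *
 * Compare the CPUID values stored in a received VM dump header with the
 * features of the local host to decide whether the VM can be resumed on
 * this machine.
 */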
544int
545vmd_check_vmh(struct vm_dump_header *vmh)
546{
547	int i;
548	unsigned int code, leaf;
549	unsigned int a, b, c, d;
550
551
552	if (vmh->vmh_version != VM_DUMP_VERSION) {
553		log_warnx("%s: incompatible dump version", __func__);
554		return (-1);
555	}
556
557	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
558		code = vmh->vmh_cpuids[i].code;
559		leaf = vmh->vmh_cpuids[i].leaf;
560		if (leaf != 0x00) {
561			log_debug("%s: invalid leaf 0x%x for code 0x%x",
562			    __func__, leaf, code);
563			return (-1);
564		}
565
566		switch (code) {
567		case 0x00:
568			CPUID_LEAF(code, leaf, a, b, c, d);
569			if (vmh->vmh_cpuids[i].a > a) {
570				log_debug("%s: incompatible cpuid level",
571				    __func__);
572				return (-1);
573			}
574			if (!(vmh->vmh_cpuids[i].b == b &&
575			    vmh->vmh_cpuids[i].c == c &&
576			    vmh->vmh_cpuids[i].d == d)) {
577				log_debug("%s: incompatible cpu brand",
578				    __func__);
579				return (-1);
580			}
581			break;
582
583		case 0x01:
584			CPUID_LEAF(code, leaf, a, b, c, d);
585			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
586			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
587				log_debug("%s: incompatible cpu features "
588				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
589				    code, leaf);
590				return (-1);
591			}
592			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
593			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
594				log_debug("%s: incompatible cpu features "
595				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
596				    code, leaf);
597				return (-1);
598			}
599			break;
600
601		case 0x07:
602			CPUID_LEAF(code, leaf, a, b, c, d);
603			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
604			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
605				log_debug("%s: incompatible cpu features "
606				    "code: 0x%x leaf: 0x%x  reg: b", __func__,
607				    code, leaf);
608				return (-1);
609			}
610			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
611			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
612				log_debug("%s: incompatible cpu features "
613				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
614				    code, leaf);
615				return (-1);
616			}
617			break;
618
619		case 0x0d:
620			CPUID_LEAF(code, leaf, a, b, c, d);
621			if (vmh->vmh_cpuids[i].b > b) {
622				log_debug("%s: incompatible cpu: insufficient "
623				    "max save area for enabled XCR0 features",
624				    __func__);
625				return (-1);
626			}
627			if (vmh->vmh_cpuids[i].c > c) {
628				log_debug("%s: incompatible cpu: insufficient "
629				    "max save area for supported XCR0 features",
630				    __func__);
631				return (-1);
632			}
633			break;
634
635		case 0x80000001:
636			CPUID_LEAF(code, leaf, a, b, c, d);
637			if ((vmh->vmh_cpuids[i].a & a) !=
638			    vmh->vmh_cpuids[i].a) {
639				log_debug("%s: incompatible cpu features "
640				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
641				    code, leaf);
642				return (-1);
643			}
644			if ((vmh->vmh_cpuids[i].c & c) !=
645			    vmh->vmh_cpuids[i].c) {
646				log_debug("%s: incompatible cpu features "
647				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
648				    code, leaf);
649				return (-1);
650			}
651			if ((vmh->vmh_cpuids[i].d & d) !=
652			    vmh->vmh_cpuids[i].d) {
653				log_debug("%s: incompatible cpu features "
654				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
655				    code, leaf);
656				return (-1);
657			}
658			break;
659
660		default:
661			log_debug("%s: unknown code 0x%x", __func__, code);
662			return (-1);
663		}
664	}
665
666	return (0);
667}
668
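/*
 * vmd_sighdlr
 *
 * libevent signal handler for the parent process: SIGHUP triggers a
 * configuration reload, SIGTERM/SIGINT shut vmd down, SIGPIPE and
 * SIGUSR1 are ignored.
 */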
669void
670vmd_sighdlr(int sig, short event, void *arg)
671{
672	if (privsep_process != PROC_PARENT)
673		return;
674	log_debug("%s: handling signal", __func__);
675
676	switch (sig) {
677	case SIGHUP:
678		log_info("%s: reload requested with SIGHUP", __func__);
679
680		/*
681		 * This is safe because libevent uses async signal handlers
682		 * that run in the event loop and not in signal context.
683		 */
684		(void)vmd_reload(0, NULL);
685		break;
686	case SIGPIPE:
687		log_info("%s: ignoring SIGPIPE", __func__);
688		break;
689	case SIGUSR1:
690		log_info("%s: ignoring SIGUSR1", __func__);
691		break;
692	case SIGTERM:
693	case SIGINT:
694		vmd_shutdown();
695		break;
696	default:
697		fatalx("unexpected signal");
698	}
699}
700
701__dead void
702usage(void)
703{
704	extern char *__progname;
705	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
706	    __progname);
707	exit(1);
708}
709
710int
711main(int argc, char **argv)
712{
713	struct privsep		*ps;
714	int			 ch;
715	const char		*conffile = VMD_CONF;
716	enum privsep_procid	 proc_id = PROC_PARENT;
717	int			 proc_instance = 0;
718	const char		*errp, *title = NULL;
719	int			 argc0 = argc;
720
721	log_init(0, LOG_DAEMON);
722
723	if ((env = calloc(1, sizeof(*env))) == NULL)
724		fatal("calloc: env");
725
726	while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) {
727		switch (ch) {
728		case 'D':
729			if (cmdline_symset(optarg) < 0)
730				log_warnx("could not parse macro definition %s",
731				    optarg);
732			break;
733		case 'd':
734			env->vmd_debug = 2;
735			break;
736		case 'f':
737			conffile = optarg;
738			break;
739		case 'v':
740			env->vmd_verbose++;
741			break;
742		case 'n':
743			env->vmd_noaction = 1;
744			break;
745		case 'P':
746			title = optarg;
747			proc_id = proc_getid(procs, nitems(procs), title);
748			if (proc_id == PROC_MAX)
749				fatalx("invalid process name");
750			break;
751		case 'I':
752			proc_instance = strtonum(optarg, 0,
753			    PROC_MAX_INSTANCES, &errp);
754			if (errp)
755				fatalx("invalid process instance");
756			break;
757		default:
758			usage();
759		}
760	}
761
762	argc -= optind;
763	if (argc > 0)
764		usage();
765
766	if (env->vmd_noaction && !env->vmd_debug)
767		env->vmd_debug = 1;
768
769	/* check for root privileges */
770	if (env->vmd_noaction == 0) {
771		if (geteuid())
772			fatalx("need root privileges");
773	}
774
775	ps = &env->vmd_ps;
776	ps->ps_env = env;
777	env->vmd_fd = -1;
778
779	if (config_init(env) == -1)
780		fatal("failed to initialize configuration");
781
782	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
783		fatal("unknown user %s", VMD_USER);
784
785	/* First proc runs as root without pledge but in default chroot */
786	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
787	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */
788
789	/* Open /dev/vmm */
790	if (env->vmd_noaction == 0) {
791		env->vmd_fd = open(VMM_NODE, O_RDWR);
792		if (env->vmd_fd == -1)
793			fatal("%s", VMM_NODE);
794	}
795
796	/* Configure the control socket */
797	ps->ps_csock.cs_name = SOCKET_NAME;
798	TAILQ_INIT(&ps->ps_rcsocks);
799
800	/* Configuration will be parsed after forking the children */
801	env->vmd_conffile = conffile;
802
803	log_init(env->vmd_debug, LOG_DAEMON);
804	log_setverbose(env->vmd_verbose);
805
806	if (env->vmd_noaction)
807		ps->ps_noaction = 1;
808	ps->ps_instance = proc_instance;
809	if (title != NULL)
810		ps->ps_title[proc_id] = title;
811
812	/* only the parent returns */
813	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
814	    proc_id);
815
816	log_procinit("parent");
817	if (!env->vmd_debug && daemon(0, 0) == -1)
818		fatal("can't daemonize");
819
820	if (ps->ps_noaction == 0)
821		log_info("startup");
822
823	event_init();
824
825	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
826	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
827	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
828	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
829	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);
830
831	signal_add(&ps->ps_evsigint, NULL);
832	signal_add(&ps->ps_evsigterm, NULL);
833	signal_add(&ps->ps_evsighup, NULL);
834	signal_add(&ps->ps_evsigpipe, NULL);
835	signal_add(&ps->ps_evsigusr1, NULL);
836
837	if (!env->vmd_noaction)
838		proc_connect(ps);
839
840	if (vmd_configure() == -1)
841		fatalx("configuration failed");
842
843	event_dispatch();
844
845	log_debug("parent exiting");
846
847	return (0);
848}
849
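/*
 * vmd_configure
 *
 * Called once at startup after the children have been forked: pledge the
 * parent process, parse the configuration file and create the configured
 * switches and VMs.
 */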
850int
851vmd_configure(void)
852{
853	struct vmd_vm		*vm;
854	struct vmd_switch	*vsw;
855
856	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
857		fatal("open %s", PATH_PTMDEV);
858
859	/*
860	 * pledge in the parent process:
861	 * stdio - for malloc and basic I/O including events.
862	 * rpath - for reload to open and read the configuration files.
863	 * wpath - for opening disk images and tap devices.
864	 * tty - for openpty and TIOCUCNTL.
865	 * proc - run kill to terminate its children safely.
866	 * sendfd - for disks, interfaces and other fds.
867	 * recvfd - for send and receive.
868	 * getpw - lookup user or group id by name.
869	 * chown, fattr - change tty ownership
870	 * flock - locking disk files
871	 */
872	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
873	    " chown fattr flock", NULL) == -1)
874		fatal("pledge");
875
876	if (parse_config(env->vmd_conffile) == -1) {
877		proc_kill(&env->vmd_ps);
878		exit(1);
879	}
880
881	if (env->vmd_noaction) {
882		fprintf(stderr, "configuration OK\n");
883		proc_kill(&env->vmd_ps);
884		exit(0);
885	}
886
887	/* Send shared global configuration to all children */
888	if (config_setconfig(env) == -1)
889		return (-1);
890
891	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
892		if (vsw->sw_running)
893			continue;
894		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
895			log_warn("%s: failed to create switch %s",
896			    __func__, vsw->sw_name);
897			switch_remove(vsw);
898			return (-1);
899		}
900	}
901
902	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
903		if (vm->vm_disabled) {
904			log_debug("%s: not creating vm %s (disabled)",
905			    __func__,
906			    vm->vm_params.vmc_params.vcp_name);
907			continue;
908		}
909		if (config_setvm(&env->vmd_ps, vm,
910		    -1, vm->vm_params.vmc_owner.uid) == -1)
911			return (-1);
912	}
913
914	return (0);
915}
916
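/*
 * vmd_reload
 *
 * Reload the configuration file (SIGHUP or "vmctl load"/"vmctl reload"),
 * or, if reset is non-zero, purge the requested parts of the running
 * configuration instead of parsing a file.
 */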
917int
918vmd_reload(unsigned int reset, const char *filename)
919{
920	struct vmd_vm		*vm, *next_vm;
921	struct vmd_switch	*vsw;
922	int			 reload = 0;
923
924	/* Switch back to the default config file */
925	if (filename == NULL || *filename == '\0') {
926		filename = env->vmd_conffile;
927		reload = 1;
928	}
929
930	log_debug("%s: level %d config file %s", __func__, reset, filename);
931
932	if (reset) {
933		/* Purge the configuration */
934		config_purge(env, reset);
935		config_setreset(env, reset);
936	} else {
937		/*
938		 * Load or reload the configuration.
939		 *
940		 * Reloading removes all non-running VMs before processing the
941		 * config file, whereas loading only adds to the existing list
942		 * of VMs.
943		 */
944
945		if (reload) {
946			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
947			    next_vm) {
948				if (vm->vm_running == 0) {
949					DPRINTF("%s: calling vm_remove",
950					    __func__);
951					vm_remove(vm, __func__);
952				}
953			}
954		}
955
956		if (parse_config(filename) == -1) {
957			log_debug("%s: failed to load config file %s",
958			    __func__, filename);
959			return (-1);
960		}
961
962		if (reload) {
963			/* Update shared global configuration in all children */
964			if (config_setconfig(env) == -1)
965				return (-1);
966		}
967
968		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
969			if (vsw->sw_running)
970				continue;
971			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
972				log_warn("%s: failed to create switch %s",
973				    __func__, vsw->sw_name);
974				switch_remove(vsw);
975				return (-1);
976			}
977		}
978
979		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
980			if (vm->vm_running == 0) {
981				if (vm->vm_disabled) {
982					log_debug("%s: not creating vm %s"
983					    " (disabled)", __func__,
984					    vm->vm_params.vmc_params.vcp_name);
985					continue;
986				}
987				if (config_setvm(&env->vmd_ps, vm,
988				    -1, vm->vm_params.vmc_owner.uid) == -1)
989					return (-1);
990			} else {
991				log_debug("%s: not creating vm \"%s\": "
992				    "(running)", __func__,
993				    vm->vm_params.vmc_params.vcp_name);
994			}
995		}
996	}
997
998	return (0);
999}
1000
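/*
 * vmd_shutdown
 *
 * Remove all VMs, terminate the child processes and exit the parent.
 */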
1001void
1002vmd_shutdown(void)
1003{
1004	struct vmd_vm *vm, *vm_next;
1005
1006	log_debug("%s: performing shutdown", __func__);
1007
1008	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
1009		vm_remove(vm, __func__);
1010	}
1011
1012	proc_kill(&env->vmd_ps);
1013	free(env);
1014
1015	log_warnx("parent terminating");
1016	exit(0);
1017}
1018
1019struct vmd_vm *
1020vm_getbyvmid(uint32_t vmid)
1021{
1022	struct vmd_vm	*vm;
1023
1024	if (vmid == 0)
1025		return (NULL);
1026	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1027		if (vm->vm_vmid == vmid)
1028			return (vm);
1029	}
1030
1031	return (NULL);
1032}
1033
1034struct vmd_vm *
1035vm_getbyid(uint32_t id)
1036{
1037	struct vmd_vm	*vm;
1038
1039	if (id == 0)
1040		return (NULL);
1041	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1042		if (vm->vm_params.vmc_params.vcp_id == id)
1043			return (vm);
1044	}
1045
1046	return (NULL);
1047}
1048
1049uint32_t
1050vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1051{
1052	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1053		return (0);
1054	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1055	    id, vm->vm_vmid);
1056	return (vm->vm_vmid);
1057}
1058
1059uint32_t
1060vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1061{
1062	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1063		return (0);
1064	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1065	    vmid, vm->vm_params.vmc_params.vcp_id);
1066	return (vm->vm_params.vmc_params.vcp_id);
1067}
1068
1069struct vmd_vm *
1070vm_getbyname(const char *name)
1071{
1072	struct vmd_vm	*vm;
1073
1074	if (name == NULL)
1075		return (NULL);
1076	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1077		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1078			return (vm);
1079	}
1080
1081	return (NULL);
1082}
1083
1084struct vmd_vm *
1085vm_getbypid(pid_t pid)
1086{
1087	struct vmd_vm	*vm;
1088
1089	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1090		if (vm->vm_pid == pid)
1091			return (vm);
1092	}
1093
1094	return (NULL);
1095}
1096
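/*
 * vm_stop
 *
 * Mark a VM as stopped and release its resources: user accounting,
 * imsg channel, disk, network, kernel and cdrom descriptors and,
 * unless keeptty is set, its tty.
 */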
1097void
1098vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
1099{
1100	struct privsep	*ps = &env->vmd_ps;
1101	unsigned int	 i, j;
1102
1103	if (vm == NULL)
1104		return;
1105
1106	log_debug("%s: %s %s stopping vm %d%s",
1107	    __func__, ps->ps_title[privsep_process], caller,
1108	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");
1109
1110	vm->vm_running = 0;
1111	vm->vm_shutdown = 0;
1112
1113	user_inc(&vm->vm_params.vmc_params, vm->vm_user, 0);
1114	user_put(vm->vm_user);
1115
1116	if (vm->vm_iev.ibuf.fd != -1) {
1117		event_del(&vm->vm_iev.ev);
1118		close(vm->vm_iev.ibuf.fd);
1119	}
1120	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
1121		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
1122			if (vm->vm_disks[i][j] != -1) {
1123				close(vm->vm_disks[i][j]);
1124				vm->vm_disks[i][j] = -1;
1125			}
1126		}
1127	}
1128	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
1129		if (vm->vm_ifs[i].vif_fd != -1) {
1130			close(vm->vm_ifs[i].vif_fd);
1131			vm->vm_ifs[i].vif_fd = -1;
1132		}
1133		free(vm->vm_ifs[i].vif_name);
1134		free(vm->vm_ifs[i].vif_switch);
1135		free(vm->vm_ifs[i].vif_group);
1136		vm->vm_ifs[i].vif_name = NULL;
1137		vm->vm_ifs[i].vif_switch = NULL;
1138		vm->vm_ifs[i].vif_group = NULL;
1139	}
1140	if (vm->vm_kernel != -1) {
1141		close(vm->vm_kernel);
1142		vm->vm_kernel = -1;
1143	}
1144	if (vm->vm_cdrom != -1) {
1145		close(vm->vm_cdrom);
1146		vm->vm_cdrom = -1;
1147	}
1148	if (!keeptty) {
1149		vm_closetty(vm);
1150		vm->vm_uid = 0;
1151	}
1152}
1153
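/*
 * vm_remove
 *
 * Stop a VM and remove it from the list of known VMs.
 */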
1154void
1155vm_remove(struct vmd_vm *vm, const char *caller)
1156{
1157	struct privsep	*ps = &env->vmd_ps;
1158
1159	if (vm == NULL)
1160		return;
1161
1162	log_debug("%s: %s %s removing vm %d from running config",
1163	    __func__, ps->ps_title[privsep_process], caller,
1164	    vm->vm_vmid);
1165
1166	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);
1167
1168	user_put(vm->vm_user);
1169	vm_stop(vm, 0, caller);
1170	free(vm);
1171}
1172
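/*
 * vm_claimid
 *
 * Return the stable vmd id (vmid) for a VM name and owner uid,
 * allocating and remembering a new one if the pair has not been seen
 * before.
 */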
1173uint32_t
1174vm_claimid(const char *name, int uid)
1175{
1176	struct name2id *n2i = NULL;
1177
1178	TAILQ_FOREACH(n2i, env->vmd_known, entry)
1179		if (strcmp(n2i->name, name) == 0 && n2i->uid == uid)
1180			return n2i->id;
1181
1182	if (++env->vmd_nvm == 0)
1183		fatalx("too many vms");
1184	if ((n2i = calloc(1, sizeof(struct name2id))) == NULL)
1185		fatalx("could not alloc vm name");
1186	n2i->id = env->vmd_nvm;
1187	n2i->uid = uid;
1188	if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name))
1189		fatalx("overlong vm name");
1190	TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry);
1191
1192	return n2i->id;
1193}
1194
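/*
 * vm_register
 *
 * Validate the VM create parameters, check the caller's permissions and
 * add the new VM to the list of known VMs.  On success *ret_vm points to
 * the registered VM; on failure -1 is returned and errno indicates the
 * reason.
 */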
1195int
1196vm_register(struct privsep *ps, struct vmop_create_params *vmc,
1197    struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
1198{
1199	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
1200	struct vm_create_params	*vcp = &vmc->vmc_params;
1201	struct vmop_owner	*vmo = NULL;
1202	struct vmd_user		*usr = NULL;
1203	uint32_t		 rng;
1204	unsigned int		 i, j;
1205	struct vmd_switch	*sw;
1206	char			*s;
1207
1208	/* Check if this is an instance of another VM */
1209	if (vm_instance(ps, &vm_parent, vmc, uid) == -1)
1210		return (-1);
1211
1212	errno = 0;
1213	*ret_vm = NULL;
1214
1215	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1216	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1217		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
1218		    uid) != 0) {
1219			errno = EPERM;
1220			goto fail;
1221		}
1222		*ret_vm = vm;
1223		errno = EALREADY;
1224		goto fail;
1225	}
1226
1227	if (vm_parent != NULL)
1228		vmo = &vm_parent->vm_params.vmc_insowner;
1229
1230	/* non-root users can only start existing VMs or instances */
1231	if (vm_checkperm(NULL, vmo, uid) != 0) {
1232		log_warnx("permission denied");
1233		errno = EPERM;
1234		goto fail;
1235	}
1236	if (vmc->vmc_flags == 0) {
1237		log_warnx("invalid configuration, no devices");
1238		errno = VMD_DISK_MISSING;
1239		goto fail;
1240	}
1241	if (vcp->vcp_ncpus == 0)
1242		vcp->vcp_ncpus = 1;
1243	if (vcp->vcp_memranges[0].vmr_size == 0)
1244		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
1245	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
1246		log_warnx("invalid number of CPUs");
1247		goto fail;
1248	} else if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) {
1249		log_warnx("invalid number of disks");
1250		goto fail;
1251	} else if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) {
1252		log_warnx("invalid number of interfaces");
1253		goto fail;
1254	} else if (strlen(vcp->vcp_kernel) == 0 &&
1255	    vcp->vcp_ndisks == 0 && strlen(vcp->vcp_cdrom) == 0) {
1256		log_warnx("no kernel or disk/cdrom specified");
1257		goto fail;
1258	} else if (strlen(vcp->vcp_name) == 0) {
1259		log_warnx("invalid VM name");
1260		goto fail;
1261	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
1262	    *vcp->vcp_name == '_') {
1263		log_warnx("invalid VM name");
1264		goto fail;
1265	} else {
1266		for (s = vcp->vcp_name; *s != '\0'; ++s) {
1267			if (!(isalnum(*s) || *s == '.' || *s == '-' ||
1268			    *s == '_')) {
1269				log_warnx("invalid VM name");
1270				goto fail;
1271			}
1272		}
1273	}
1274
1275	/* track active users */
1276	if (uid != 0 && env->vmd_users != NULL &&
1277	    (usr = user_get(uid)) == NULL) {
1278		log_warnx("could not add user");
1279		goto fail;
1280	}
1281
1282	if ((vm = calloc(1, sizeof(*vm))) == NULL)
1283		goto fail;
1284
1285	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
1286	vmc = &vm->vm_params;
1287	vcp = &vmc->vmc_params;
1288	vm->vm_pid = -1;
1289	vm->vm_tty = -1;
1290	vm->vm_receive_fd = -1;
1291	vm->vm_paused = 0;
1292	vm->vm_user = usr;
1293
1294	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
1295		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
1296			vm->vm_disks[i][j] = -1;
1297	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
1298		vm->vm_ifs[i].vif_fd = -1;
1299	for (i = 0; i < vcp->vcp_nnics; i++) {
1300		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
1301			/* inherit per-interface flags from the switch */
1302			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
1303		}
1304
1305		/*
1306		 * If the MAC address is zero, always randomize it in vmd(8)
1307		 * because we cannot rely on the guest OS to do the right
1308		 * thing like OpenBSD does.  Based on ether_fakeaddr()
1309		 * from the kernel, incremented by one to differentiate
1310		 * the source.
1311		 */
1312		if (memcmp(zero_mac, &vcp->vcp_macs[i], ETHER_ADDR_LEN) == 0) {
1313			rng = arc4random();
1314			vcp->vcp_macs[i][0] = 0xfe;
1315			vcp->vcp_macs[i][1] = 0xe1;
1316			vcp->vcp_macs[i][2] = 0xba + 1;
1317			vcp->vcp_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
1318			vcp->vcp_macs[i][4] = rng;
1319			vcp->vcp_macs[i][5] = rng >> 8;
1320		}
1321	}
1322	vm->vm_kernel = -1;
1323	vm->vm_cdrom = -1;
1324	vm->vm_iev.ibuf.fd = -1;
1325
1326	/* Assign a new internal Id if not specified */
1327	vm->vm_vmid = (id == 0) ? vm_claimid(vcp->vcp_name, uid) : id;
1328
1329	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
1330	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);
1331
1332	*ret_vm = vm;
1333	return (0);
1334 fail:
1335	if (errno == 0)
1336		errno = EINVAL;
1337	return (-1);
1338}
1339
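/*
 * vm_instance
 *
 * If the create request refers to an instance of another VM, check the
 * instance permissions and fill in any parameters that are inherited
 * from the parent VM.
 */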
1340int
1341vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1342    struct vmop_create_params *vmc, uid_t uid)
1343{
1344	char			*name;
1345	struct vm_create_params	*vcp = &vmc->vmc_params;
1346	struct vmop_create_params *vmcp;
1347	struct vm_create_params	*vcpp;
1348	struct vmd_vm		*vm = NULL;
1349	unsigned int		 i, j;
1350	uint32_t		 id;
1351
1352	/* return without error if the parent is NULL (nothing to inherit) */
1353	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1354	    (*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL)
1355		return (0);
1356
1357	errno = 0;
1358	vmcp = &(*vm_parent)->vm_params;
1359	vcpp = &vmcp->vmc_params;
1360
1361	/* Are we allowed to create an instance from this VM? */
1362	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1363		log_warnx("vm \"%s\" no permission to create vm instance",
1364		    vcpp->vcp_name);
1365		errno = EPERM;
1366		return (-1);
1367	}
1368
1369	id = vcp->vcp_id;
1370	name = vcp->vcp_name;
1371
1372	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1373	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1374		errno = EPROCLIM;
1375		return (-1);
1376	}
1377
1378	/* CPU */
1379	if (vcp->vcp_ncpus == 0)
1380		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1381	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1382	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1383		log_warnx("vm \"%s\" no permission to set cpus", name);
1384		errno = EPERM;
1385		return (-1);
1386	}
1387
1388	/* memory */
1389	if (vcp->vcp_memranges[0].vmr_size == 0)
1390		vcp->vcp_memranges[0].vmr_size =
1391		    vcpp->vcp_memranges[0].vmr_size;
1392	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1393	    vcp->vcp_memranges[0].vmr_size !=
1394	    vcpp->vcp_memranges[0].vmr_size) {
1395		log_warnx("vm \"%s\" no permission to set memory", name);
1396		errno = EPERM;
1397		return (-1);
1398	}
1399
1400	/* disks cannot be inherited */
1401	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1402	    vcp->vcp_ndisks) {
1403		log_warnx("vm \"%s\" no permission to set disks", name);
1404		errno = EPERM;
1405		return (-1);
1406	}
1407	for (i = 0; i < vcp->vcp_ndisks; i++) {
1408		/* Check if this disk is already used in the parent */
1409		for (j = 0; j < vcpp->vcp_ndisks; j++) {
1410			if (strcmp(vcp->vcp_disks[i],
1411			    vcpp->vcp_disks[j]) == 0) {
1412				log_warnx("vm \"%s\" disk %s cannot be reused",
1413				    name, vcp->vcp_disks[i]);
1414				errno = EBUSY;
1415				return (-1);
1416			}
1417		}
1418		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1419	}
1420
1421	/* interfaces */
1422	if (vcp->vcp_nnics > 0 &&
1423	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1424	    vcp->vcp_nnics != vcpp->vcp_nnics) {
1425		log_warnx("vm \"%s\" no permission to set interfaces", name);
1426		errno = EPERM;
1427		return (-1);
1428	}
1429	for (i = 0; i < vcpp->vcp_nnics; i++) {
1430		/* Interface got overwritten */
1431		if (i < vcp->vcp_nnics)
1432			continue;
1433
1434		/* Copy interface from parent */
1435		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1436		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1437		    sizeof(vmc->vmc_ifnames[i]));
1438		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1439		    sizeof(vmc->vmc_ifswitch[i]));
1440		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1441		    sizeof(vmc->vmc_ifgroup[i]));
1442		memcpy(vcp->vcp_macs[i], vcpp->vcp_macs[i],
1443		    sizeof(vcp->vcp_macs[i]));
1444		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1445		vcp->vcp_nnics++;
1446	}
1447	for (i = 0; i < vcp->vcp_nnics; i++) {
1448		for (j = 0; j < vcpp->vcp_nnics; j++) {
1449			if (memcmp(zero_mac, vcp->vcp_macs[i],
1450			    sizeof(vcp->vcp_macs[i])) != 0 &&
1451			    memcmp(vcpp->vcp_macs[i], vcp->vcp_macs[i],
1452			    sizeof(vcp->vcp_macs[i])) != 0) {
1453				log_warnx("vm \"%s\" lladdr cannot be reused",
1454				    name);
1455				errno = EBUSY;
1456				return (-1);
1457			}
1458			if (strlen(vmc->vmc_ifnames[i]) &&
1459			    strcmp(vmc->vmc_ifnames[i],
1460			    vmcp->vmc_ifnames[j]) == 0) {
1461				log_warnx("vm \"%s\" %s cannot be reused",
1462				    name, vmc->vmc_ifnames[i]);
1463				errno = EBUSY;
1464				return (-1);
1465			}
1466		}
1467	}
1468
1469	/* kernel */
1470	if (strlen(vcp->vcp_kernel) > 0) {
1471		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1472			log_warnx("vm \"%s\" no permission to set boot image",
1473			    name);
1474			errno = EPERM;
1475			return (-1);
1476		}
1477		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1478	} else if (strlcpy(vcp->vcp_kernel, vcpp->vcp_kernel,
1479	    sizeof(vcp->vcp_kernel)) >= sizeof(vcp->vcp_kernel)) {
1480		log_warnx("vm \"%s\" kernel name too long", name);
1481		errno = EINVAL;
1482		return (-1);
1483	}
1484
1485	/* cdrom */
1486	if (strlen(vcp->vcp_cdrom) > 0) {
1487		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1488			log_warnx("vm \"%s\" no permission to set cdrom", name);
1489			errno = EPERM;
1490			return (-1);
1491		}
1492		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1493	} else if (strlcpy(vcp->vcp_cdrom, vcpp->vcp_cdrom,
1494	    sizeof(vcp->vcp_cdrom)) >= sizeof(vcp->vcp_cdrom)) {
1495		log_warnx("vm \"%s\" cdrom name too long", name);
1496		errno = EINVAL;
1497		return (-1);
1498	}
1499
1500	/* user */
1501	if (vmc->vmc_owner.uid == 0)
1502		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1503	else if (vmc->vmc_owner.uid != uid &&
1504	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1505		log_warnx("vm \"%s\" user mismatch", name);
1506		errno = EPERM;
1507		return (-1);
1508	}
1509
1510	/* group */
1511	if (vmc->vmc_owner.gid == 0)
1512		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1513	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1514		log_warnx("vm \"%s\" group mismatch", name);
1515		errno = EPERM;
1516		return (-1);
1517	}
1518
1519	/* child instances */
1520	if (vmc->vmc_insflags) {
1521		log_warnx("vm \"%s\" cannot change instance permissions", name);
1522		errno = EPERM;
1523		return (-1);
1524	}
1525	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1526		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
1527		vmc->vmc_insowner.uid = vmcp->vmc_insowner.uid;
1528		vmc->vmc_insflags = vmcp->vmc_insflags;
1529	} else {
1530		vmc->vmc_insowner.gid = 0;
1531		vmc->vmc_insowner.uid = 0;
1532		vmc->vmc_insflags = 0;
1533	}
1534
1535	/* finished, remove instance flags */
1536	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1537
1538	return (0);
1539}
1540
1541/*
1542 * vm_checkperm
1543 *
1544 * Checks if the user represented by the 'uid' parameter is allowed to
1545 * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1546 * console.)
1547 *
1548 * Parameters:
1549 *  vm: the VM whose permission is to be checked
1550 *  vmo: the required uid/gid to be checked
1551 *  uid: the user ID of the user making the request
1552 *
1553 * Return values:
1554 *   0: the permission should be granted
1555 *  -1: the permission check failed (also returned if vmo == NULL)
1556 */
1557int
1558vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1559{
1560	struct group	*gr;
1561	struct passwd	*pw;
1562	char		**grmem;
1563
1564	/* root has no restrictions */
1565	if (uid == 0)
1566		return (0);
1567
1568	if (vmo == NULL)
1569		return (-1);
1570
1571	/* check user */
1572	if (vm == NULL) {
1573		if (vmo->uid == uid)
1574			return (0);
1575	} else {
1576		/*
1577		 * check user of running vm (the owner of a running vm can
1578		 * be different from (or more specific than) the configured owner).
1579		 */
1580		if ((vm->vm_running && vm->vm_uid == uid) ||
1581		    (!vm->vm_running && vmo->uid == uid))
1582			return (0);
1583	}
1584
1585	/* check groups */
1586	if (vmo->gid != -1) {
1587		if ((pw = getpwuid(uid)) == NULL)
1588			return (-1);
1589		if (pw->pw_gid == vmo->gid)
1590			return (0);
1591		if ((gr = getgrgid(vmo->gid)) != NULL) {
1592			for (grmem = gr->gr_mem; *grmem; grmem++)
1593				if (strcmp(*grmem, pw->pw_name) == 0)
1594					return (0);
1595		}
1596	}
1597
1598	return (-1);
1599}
1600
1601/*
1602 * vm_checkinsflag
1603 *
1604 * Checks whether the non-root user is allowed to set an instance option.
1605 *
1606 * Parameters:
1607 *  vmc: the VM create parameters
1608 *  flag: the flag to be checked
1609 *  uid: the user ID of the user making the request
1610 *
1611 * Return values:
1612 *   0: the permission should be granted
1613 *  -1: the permission check failed
1614 */
1615int
1616vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1617{
1618	/* root has no restrictions */
1619	if (uid == 0)
1620		return (0);
1621
1622	if ((vmc->vmc_insflags & flag) == 0)
1623		return (-1);
1624
1625	return (0);
1626}
1627
1628/*
1629 * vm_checkaccess
1630 *
1631 * Checks if the user represented by the 'uid' parameter is allowed to
1632 * access the file described by the 'path' parameter.
1633 *
1634 * Parameters:
1635 *  fd: the file descriptor of the opened file
1636 *  uflag: check if the userid has access to the file
1637 *  uid: the user ID of the user making the request
1638 *  amode: the access flags of R_OK and W_OK
1639 *
1640 * Return values:
1641 *   0: the permission should be granted
1642 *  -1: the permission check failed
1643 */
1644int
1645vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
1646{
1647	struct group	*gr;
1648	struct passwd	*pw;
1649	char		**grmem;
1650	struct stat	 st;
1651	mode_t		 mode;
1652
1653	if (fd == -1)
1654		return (-1);
1655
1656	/*
1657	 * File has to be accessible and a regular file
1658	 */
1659	if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode))
1660		return (-1);
1661
1662	/* root has no restrictions */
1663	if (uid == 0 || uflag == 0)
1664		return (0);
1665
1666	/* check other */
1667	mode = amode & W_OK ? S_IWOTH : 0;
1668	mode |= amode & R_OK ? S_IROTH : 0;
1669	if ((st.st_mode & mode) == mode)
1670		return (0);
1671
1672	/* check user */
1673	mode = amode & W_OK ? S_IWUSR : 0;
1674	mode |= amode & R_OK ? S_IRUSR : 0;
1675	if (uid == st.st_uid && (st.st_mode & mode) == mode)
1676		return (0);
1677
1678	/* check groups */
1679	mode = amode & W_OK ? S_IWGRP : 0;
1680	mode |= amode & R_OK ? S_IRGRP : 0;
1681	if ((st.st_mode & mode) != mode)
1682		return (-1);
1683	if ((pw = getpwuid(uid)) == NULL)
1684		return (-1);
1685	if (pw->pw_gid == st.st_gid)
1686		return (0);
1687	if ((gr = getgrgid(st.st_gid)) != NULL) {
1688		for (grmem = gr->gr_mem; *grmem; grmem++)
1689			if (strcmp(*grmem, pw->pw_name) == 0)
1690				return (0);
1691	}
1692
1693	return (-1);
1694}
1695
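/*
 * vm_opentty
 *
 * Allocate a pty for the VM console via the pre-opened PTM device and
 * adjust its ownership and mode for the VM owner.
 */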
1696int
1697vm_opentty(struct vmd_vm *vm)
1698{
1699	struct ptmget		 ptm;
1700	struct stat		 st;
1701	struct group		*gr;
1702	uid_t			 uid;
1703	gid_t			 gid;
1704	mode_t			 mode;
1705	int			 on;
1706
1707	/*
1708	 * Open tty with pre-opened PTM fd
1709	 */
1710	if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1))
1711		return (-1);
1712
1713	/*
1714	 * We use user ioctl(2) mode to pass break commands.
1715	 */
1716	on = 1;
1717	if (ioctl(ptm.cfd, TIOCUCNTL, &on))
1718		fatal("could not enable user ioctl mode");
1719
1720	vm->vm_tty = ptm.cfd;
1721	close(ptm.sfd);
1722	if ((vm->vm_ttyname = strdup(ptm.sn)) == NULL)
1723		goto fail;
1724
1725	uid = vm->vm_uid;
1726	gid = vm->vm_params.vmc_owner.gid;
1727
1728	if (vm->vm_params.vmc_owner.gid != -1) {
1729		mode = 0660;
1730	} else if ((gr = getgrnam("tty")) != NULL) {
1731		gid = gr->gr_gid;
1732		mode = 0620;
1733	} else {
1734		mode = 0600;
1735		gid = 0;
1736	}
1737
1738	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
1739	    __func__, vm->vm_params.vmc_params.vcp_name,
1740	    vm->vm_ttyname, uid, gid, mode);
1741
1742	/*
1743	 * Change ownership and mode of the tty as required.
1744	 * Loosely based on the implementation of sshpty.c
1745	 */
1746	if (stat(vm->vm_ttyname, &st) == -1)
1747		goto fail;
1748
1749	if (st.st_uid != uid || st.st_gid != gid) {
1750		if (chown(vm->vm_ttyname, uid, gid) == -1) {
1751			log_warn("chown %s %d %d failed, uid %d",
1752			    vm->vm_ttyname, uid, gid, getuid());
1753
1754			/* Ignore failure on read-only filesystems */
1755			if (!((errno == EROFS) &&
1756			    (st.st_uid == uid || st.st_uid == 0)))
1757				goto fail;
1758		}
1759	}
1760
1761	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
1762		if (chmod(vm->vm_ttyname, mode) == -1) {
1763			log_warn("chmod %s %o failed, uid %d",
1764			    vm->vm_ttyname, mode, getuid());
1765
1766			/* Ignore failure on read-only filesystems */
1767			if (!((errno == EROFS) &&
1768			    (st.st_uid == uid || st.st_uid == 0)))
1769				goto fail;
1770		}
1771	}
1772
1773	return (0);
1774 fail:
1775	vm_closetty(vm);
1776	return (-1);
1777}
1778
1779void
1780vm_closetty(struct vmd_vm *vm)
1781{
1782	if (vm->vm_tty != -1) {
1783		/* Release and close the tty */
1784		if (fchown(vm->vm_tty, 0, 0) == -1)
1785			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1786		if (fchmod(vm->vm_tty, 0666) == -1)
1787			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1788		close(vm->vm_tty);
1789		vm->vm_tty = -1;
1790	}
1791	free(vm->vm_ttyname);
1792	vm->vm_ttyname = NULL;
1793}
1794
1795void
1796switch_remove(struct vmd_switch *vsw)
1797{
1798	if (vsw == NULL)
1799		return;
1800
1801	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1802
1803	free(vsw->sw_group);
1804	free(vsw->sw_name);
1805	free(vsw);
1806}
1807
1808struct vmd_switch *
1809switch_getbyname(const char *name)
1810{
1811	struct vmd_switch	*vsw;
1812
1813	if (name == NULL)
1814		return (NULL);
1815	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1816		if (strcmp(vsw->sw_name, name) == 0)
1817			return (vsw);
1818	}
1819
1820	return (NULL);
1821}
1822
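/*
 * user_get
 *
 * Return a reference-counted accounting entry for the given non-root
 * uid, creating it on first use.
 */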
1823struct vmd_user *
1824user_get(uid_t uid)
1825{
1826	struct vmd_user		*usr;
1827
1828	if (uid == 0)
1829		return (NULL);
1830
1831	/* first try to find an existing user */
1832	TAILQ_FOREACH(usr, env->vmd_users, usr_entry) {
1833		if (usr->usr_id.uid == uid)
1834			goto done;
1835	}
1836
1837	if ((usr = calloc(1, sizeof(*usr))) == NULL) {
1838		log_warn("could not allocate user");
1839		return (NULL);
1840	}
1841
1842	usr->usr_id.uid = uid;
1843	usr->usr_id.gid = -1;
1844	TAILQ_INSERT_TAIL(env->vmd_users, usr, usr_entry);
1845
1846 done:
1847	DPRINTF("%s: uid %d #%d +",
1848	    __func__, usr->usr_id.uid, usr->usr_refcnt + 1);
1849	usr->usr_refcnt++;
1850
1851	return (usr);
1852}
1853
1854void
1855user_put(struct vmd_user *usr)
1856{
1857	if (usr == NULL)
1858		return;
1859
1860	DPRINTF("%s: uid %d #%d -",
1861	    __func__, usr->usr_id.uid, usr->usr_refcnt - 1);
1862
1863	if (--usr->usr_refcnt > 0)
1864		return;
1865
1866	TAILQ_REMOVE(env->vmd_users, usr, usr_entry);
1867	free(usr);
1868}
1869
1870void
1871user_inc(struct vm_create_params *vcp, struct vmd_user *usr, int inc)
1872{
1873	char	 mem[FMT_SCALED_STRSIZE];
1874
1875	if (usr == NULL)
1876		return;
1877
1878	/* increment or decrement counters */
1879	inc = inc ? 1 : -1;
1880
1881	usr->usr_maxcpu += vcp->vcp_ncpus * inc;
1882	usr->usr_maxmem += vcp->vcp_memranges[0].vmr_size * inc;
1883	usr->usr_maxifs += vcp->vcp_nnics * inc;
1884
1885	if (log_getverbose() > 1) {
1886		(void)fmt_scaled(usr->usr_maxmem * 1024 * 1024, mem);
1887		log_debug("%s: %c uid %d ref %d cpu %llu mem %s ifs %llu",
1888		    __func__, inc == 1 ? '+' : '-',
1889		    usr->usr_id.uid, usr->usr_refcnt,
1890		    usr->usr_maxcpu, mem, usr->usr_maxifs);
1891	}
1892}
1893
1894int
1895user_checklimit(struct vmd_user *usr, struct vm_create_params *vcp)
1896{
1897	const char	*limit = "";
1898
1899	/* XXX make the limits configurable */
1900	if (usr->usr_maxcpu > VM_DEFAULT_USER_MAXCPU) {
1901		limit = "cpu ";
1902		goto fail;
1903	}
1904	if (usr->usr_maxmem > VM_DEFAULT_USER_MAXMEM) {
1905		limit = "memory ";
1906		goto fail;
1907	}
1908	if (usr->usr_maxifs > VM_DEFAULT_USER_MAXIFS) {
1909		limit = "interface ";
1910		goto fail;
1911	}
1912
1913	return (0);
1914
1915 fail:
1916	log_warnx("%s: user %d %slimit reached", vcp->vcp_name,
1917	    usr->usr_id.uid, limit);
1918	return (-1);
1919}
1920
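/*
 * get_string
 *
 * Copy the leading printable characters of an imsg payload into a newly
 * allocated NUL-terminated string.
 */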
1921char *
1922get_string(uint8_t *ptr, size_t len)
1923{
1924	size_t	 i;
1925
1926	for (i = 0; i < len; i++)
1927		if (!isprint(ptr[i]))
1928			break;
1929
1930	return strndup(ptr, i);
1931}
1932
1933uint32_t
1934prefixlen2mask(uint8_t prefixlen)
1935{
1936	if (prefixlen == 0)
1937		return (0);
1938
1939	if (prefixlen > 32)
1940		prefixlen = 32;
1941
1942	return (htonl(0xffffffff << (32 - prefixlen)));
1943}
1944
1945void
1946prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
1947{
1948	struct in6_addr	 s6;
1949	int		 i;
1950
1951	if (prefixlen > 128)
1952		prefixlen = 128;
1953
1954	memset(&s6, 0, sizeof(s6));
1955	for (i = 0; i < prefixlen / 8; i++)
1956		s6.s6_addr[i] = 0xff;
1957	i = prefixlen % 8;
1958	if (i)
1959		s6.s6_addr[prefixlen / 8] = 0xff00 >> i;
1960
1961	memcpy(mask, &s6, sizeof(s6));
1962}
1963
1964void
1965getmonotime(struct timeval *tv)
1966{
1967	struct timespec	 ts;
1968
1969	if (clock_gettime(CLOCK_MONOTONIC, &ts))
1970		fatal("clock_gettime");
1971
1972	TIMESPEC_TO_TIMEVAL(tv, &ts);
1973}
1974