/* vmd.c revision 1.109 */
1/*	$OpenBSD: vmd.c,v 1.109 2019/05/11 01:05:17 jasper Exp $	*/
2
3/*
4 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/param.h>	/* nitems */
20#include <sys/queue.h>
21#include <sys/wait.h>
22#include <sys/cdefs.h>
23#include <sys/stat.h>
24#include <sys/tty.h>
25#include <sys/ttycom.h>
26#include <sys/ioctl.h>
27
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31#include <termios.h>
32#include <errno.h>
33#include <event.h>
34#include <fcntl.h>
35#include <pwd.h>
36#include <signal.h>
37#include <syslog.h>
38#include <unistd.h>
39#include <util.h>
40#include <ctype.h>
41#include <pwd.h>
42#include <grp.h>
43
44#include <machine/specialreg.h>
45#include <machine/vmmvar.h>
46
47#include "proc.h"
48#include "atomicio.h"
49#include "vmd.h"
50
/* Prints usage and exits; hence __dead. */
__dead void usage(void);

int	 main(int, char **);
int	 vmd_configure(void);
void	 vmd_sighdlr(int sig, short event, void *arg);
void	 vmd_shutdown(void);
int	 vmd_control_run(void);
int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
int	 vmd_check_vmh(struct vm_dump_header *);

int	 vm_instance(struct privsep *, struct vmd_vm **,
	    struct vmop_create_params *, uid_t);
int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
uint32_t vm_claimid(const char *, int);

/* Global daemon state shared by all functions in this file. */
struct vmd	*env;

/*
 * Privsep child process table: name, process id and the imsg callback
 * the parent uses to handle messages from that child.
 */
static struct privsep_proc procs[] = {
	/* Keep "priv" on top as procs[0] */
	{ "priv",	PROC_PRIV,	NULL, priv },
	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm, vmm_shutdown },
};

/* For the privileged process */
static struct privsep_proc *proc_priv = &procs[0];
static struct passwd proc_privpw;	/* all-zero passwd, see main() */
static const uint8_t zero_mac[ETHER_ADDR_LEN];	/* all-zero MAC address */
80
81int
82vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
83{
84	struct privsep			*ps = p->p_ps;
85	int				 res = 0, ret = 0, cmd = 0, verbose;
86	unsigned int			 v = 0, flags;
87	struct vmop_create_params	 vmc;
88	struct vmop_id			 vid;
89	struct vmop_result		 vmr;
90	struct vm_dump_header		 vmh;
91	struct vmd_vm			*vm = NULL;
92	char				*str = NULL;
93	uint32_t			 id = 0;
94	struct control_sock		*rcs;
95
96	switch (imsg->hdr.type) {
97	case IMSG_VMDOP_START_VM_REQUEST:
98		IMSG_SIZE_CHECK(imsg, &vmc);
99		memcpy(&vmc, imsg->data, sizeof(vmc));
100		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
101		if (vmc.vmc_flags == 0) {
102			/* start an existing VM with pre-configured options */
103			if (!(ret == -1 && errno == EALREADY &&
104			    vm->vm_running == 0)) {
105				res = errno;
106				cmd = IMSG_VMDOP_START_VM_RESPONSE;
107			}
108		} else if (ret != 0) {
109			res = errno;
110			cmd = IMSG_VMDOP_START_VM_RESPONSE;
111		}
112		if (res == 0 &&
113		    config_setvm(ps, vm,
114		    imsg->hdr.peerid, vm->vm_params.vmc_owner.uid) == -1) {
115			res = errno;
116			cmd = IMSG_VMDOP_START_VM_RESPONSE;
117		}
118		break;
119	case IMSG_VMDOP_WAIT_VM_REQUEST:
120	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
121		IMSG_SIZE_CHECK(imsg, &vid);
122		memcpy(&vid, imsg->data, sizeof(vid));
123		flags = vid.vid_flags;
124
125		if ((id = vid.vid_id) == 0) {
126			/* Lookup vm (id) by name */
127			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
128				res = ENOENT;
129				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
130				break;
131			} else if (vm->vm_shutdown &&
132			    (flags & VMOP_FORCE) == 0) {
133				res = EALREADY;
134				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
135				break;
136			} else if (vm->vm_running == 0) {
137				res = EINVAL;
138				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
139				break;
140			}
141			id = vm->vm_vmid;
142		} else if ((vm = vm_getbyvmid(id)) == NULL) {
143			res = ENOENT;
144			cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
145			break;
146		}
147		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
148		    vid.vid_uid) != 0) {
149			res = EPERM;
150			cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
151			break;
152		}
153
154		memset(&vid, 0, sizeof(vid));
155		vid.vid_id = id;
156		vid.vid_flags = flags;
157		if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
158		    imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
159			return (-1);
160		break;
161	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
162		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
163		break;
164	case IMSG_VMDOP_LOAD:
165		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
166		str = get_string((uint8_t *)imsg->data,
167		    IMSG_DATA_SIZE(imsg));
168	case IMSG_VMDOP_RELOAD:
169		if (vmd_reload(0, str) == -1)
170			cmd = IMSG_CTL_FAIL;
171		else
172			cmd = IMSG_CTL_OK;
173		free(str);
174		break;
175	case IMSG_CTL_RESET:
176		IMSG_SIZE_CHECK(imsg, &v);
177		memcpy(&v, imsg->data, sizeof(v));
178		if (vmd_reload(v, NULL) == -1)
179			cmd = IMSG_CTL_FAIL;
180		else
181			cmd = IMSG_CTL_OK;
182		break;
183	case IMSG_CTL_VERBOSE:
184		IMSG_SIZE_CHECK(imsg, &verbose);
185		memcpy(&verbose, imsg->data, sizeof(verbose));
186		log_setverbose(verbose);
187
188		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
189		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
190		cmd = IMSG_CTL_OK;
191		break;
192	case IMSG_VMDOP_PAUSE_VM:
193	case IMSG_VMDOP_UNPAUSE_VM:
194		IMSG_SIZE_CHECK(imsg, &vid);
195		memcpy(&vid, imsg->data, sizeof(vid));
196		if (vid.vid_id == 0) {
197			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
198				res = ENOENT;
199				cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
200				break;
201			} else {
202				vid.vid_id = vm->vm_vmid;
203			}
204		} else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
205			res = ENOENT;
206			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
207			break;
208		}
209		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
210		    vid.vid_uid) != 0) {
211			res = EPERM;
212			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
213			break;
214		}
215		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
216		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
217		break;
218	case IMSG_VMDOP_SEND_VM_REQUEST:
219		IMSG_SIZE_CHECK(imsg, &vid);
220		memcpy(&vid, imsg->data, sizeof(vid));
221		id = vid.vid_id;
222		if (vid.vid_id == 0) {
223			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
224				res = ENOENT;
225				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
226				close(imsg->fd);
227				break;
228			} else {
229				vid.vid_id = vm->vm_vmid;
230			}
231		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
232			res = ENOENT;
233			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
234			close(imsg->fd);
235			break;
236		} else {
237		}
238		vmr.vmr_id = vid.vid_id;
239		log_debug("%s: sending fd to vmm", __func__);
240		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
241		    imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid));
242		break;
243	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
244		IMSG_SIZE_CHECK(imsg, &vid);
245		memcpy(&vid, imsg->data, sizeof(vid));
246		if (imsg->fd == -1) {
247			log_warnx("%s: invalid fd", __func__);
248			return (-1);
249		}
250		if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) !=
251		    sizeof(vmh)) {
252			log_warnx("%s: error reading vmh from received vm",
253			    __func__);
254			res = EIO;
255			close(imsg->fd);
256			cmd = IMSG_VMDOP_START_VM_RESPONSE;
257			break;
258		}
259
260		if (vmd_check_vmh(&vmh)) {
261			res = ENOENT;
262			close(imsg->fd);
263			cmd = IMSG_VMDOP_START_VM_RESPONSE;
264			break;
265		}
266		if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) !=
267		    sizeof(vmc)) {
268			log_warnx("%s: error reading vmc from received vm",
269			    __func__);
270			res = EIO;
271			close(imsg->fd);
272			cmd = IMSG_VMDOP_START_VM_RESPONSE;
273			break;
274		}
275		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
276		    sizeof(vmc.vmc_params.vcp_name));
277		vmc.vmc_params.vcp_id = 0;
278
279		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
280		if (ret != 0) {
281			res = errno;
282			cmd = IMSG_VMDOP_START_VM_RESPONSE;
283			close(imsg->fd);
284		} else {
285			vm->vm_received = 1;
286			config_setvm(ps, vm, imsg->hdr.peerid,
287			    vmc.vmc_owner.uid);
288			log_debug("%s: sending fd to vmm", __func__);
289			proc_compose_imsg(ps, PROC_VMM, -1,
290			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd,
291			    NULL, 0);
292		}
293		break;
294	case IMSG_VMDOP_DONE:
295		control_reset(&ps->ps_csock);
296		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
297			control_reset(rcs);
298		cmd = 0;
299		break;
300	default:
301		return (-1);
302	}
303
304	switch (cmd) {
305	case 0:
306		break;
307	case IMSG_VMDOP_START_VM_RESPONSE:
308	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
309		memset(&vmr, 0, sizeof(vmr));
310		vmr.vmr_result = res;
311		vmr.vmr_id = id;
312		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
313		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
314			return (-1);
315		break;
316	default:
317		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
318		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
319			return (-1);
320		break;
321	}
322
323	return (0);
324}
325
326int
327vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
328{
329	struct vmop_result	 vmr;
330	struct privsep		*ps = p->p_ps;
331	int			 res = 0;
332	struct vmd_vm		*vm;
333	struct vm_create_params	*vcp;
334	struct vmop_info_result	 vir;
335
336	switch (imsg->hdr.type) {
337	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
338		IMSG_SIZE_CHECK(imsg, &vmr);
339		memcpy(&vmr, imsg->data, sizeof(vmr));
340		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
341			break;
342		proc_compose_imsg(ps, PROC_CONTROL, -1,
343		    imsg->hdr.type, imsg->hdr.peerid, -1,
344		    imsg->data, sizeof(imsg->data));
345		log_info("%s: paused vm %d successfully",
346		    vm->vm_params.vmc_params.vcp_name,
347		    vm->vm_vmid);
348		vm->vm_paused = 1;
349		break;
350	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
351		IMSG_SIZE_CHECK(imsg, &vmr);
352		memcpy(&vmr, imsg->data, sizeof(vmr));
353		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
354			break;
355		proc_compose_imsg(ps, PROC_CONTROL, -1,
356		    imsg->hdr.type, imsg->hdr.peerid, -1,
357		    imsg->data, sizeof(imsg->data));
358		log_info("%s: unpaused vm %d successfully.",
359		    vm->vm_params.vmc_params.vcp_name,
360		    vm->vm_vmid);
361		vm->vm_paused = 0;
362		break;
363	case IMSG_VMDOP_START_VM_RESPONSE:
364		IMSG_SIZE_CHECK(imsg, &vmr);
365		memcpy(&vmr, imsg->data, sizeof(vmr));
366		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
367			break;
368		vm->vm_pid = vmr.vmr_pid;
369		vcp = &vm->vm_params.vmc_params;
370		vcp->vcp_id = vmr.vmr_id;
371
372		/*
373		 * If the peerid is not -1, forward the response back to the
374		 * the control socket.  If it is -1, the request originated
375		 * from the parent, not the control socket.
376		 */
377		if (vm->vm_peerid != (uint32_t)-1) {
378			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
379			    sizeof(vmr.vmr_ttyname));
380			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
381			    imsg->hdr.type, vm->vm_peerid, -1,
382			    &vmr, sizeof(vmr)) == -1) {
383				errno = vmr.vmr_result;
384				log_warn("%s: failed to foward vm result",
385				    vcp->vcp_name);
386				vm_remove(vm, __func__);
387				return (-1);
388			}
389		}
390
391		if (vmr.vmr_result) {
392			errno = vmr.vmr_result;
393			log_warn("%s: failed to start vm", vcp->vcp_name);
394			vm_remove(vm, __func__);
395			break;
396		}
397
398		/* Now configure all the interfaces */
399		if (vm_priv_ifconfig(ps, vm) == -1) {
400			log_warn("%s: failed to configure vm", vcp->vcp_name);
401			vm_remove(vm, __func__);
402			break;
403		}
404
405		log_info("%s: started vm %d successfully, tty %s",
406		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
407		break;
408	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
409		IMSG_SIZE_CHECK(imsg, &vmr);
410		memcpy(&vmr, imsg->data, sizeof(vmr));
411		DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
412		    __func__, vmr.vmr_id);
413		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
414		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
415			break;
416		if (vmr.vmr_result == 0) {
417			/* Mark VM as shutting down */
418			vm->vm_shutdown = 1;
419		}
420		break;
421	case IMSG_VMDOP_SEND_VM_RESPONSE:
422		IMSG_SIZE_CHECK(imsg, &vmr);
423		memcpy(&vmr, imsg->data, sizeof(vmr));
424		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
425			break;
426		if (!vmr.vmr_result) {
427			log_info("%s: sent vm %d successfully.",
428			    vm->vm_params.vmc_params.vcp_name,
429			    vm->vm_vmid);
430			if (vm->vm_from_config)
431				vm_stop(vm, 0, __func__);
432			else
433				vm_remove(vm, __func__);
434		}
435
436		/* Send a response if a control client is waiting for it */
437		if (imsg->hdr.peerid != (uint32_t)-1) {
438			/* the error is meaningless for deferred responses */
439			vmr.vmr_result = 0;
440
441			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
442			    IMSG_VMDOP_SEND_VM_RESPONSE,
443			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
444				return (-1);
445		}
446		break;
447	case IMSG_VMDOP_TERMINATE_VM_EVENT:
448		IMSG_SIZE_CHECK(imsg, &vmr);
449		memcpy(&vmr, imsg->data, sizeof(vmr));
450		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
451		    __func__, vmr.vmr_id, vmr.vmr_result);
452		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
453			log_debug("%s: vm %d is no longer available",
454			    __func__, vmr.vmr_id);
455			break;
456		}
457		if (vmr.vmr_result != EAGAIN ||
458		    vm->vm_params.vmc_bootdevice) {
459			if (vm->vm_from_config)
460				vm_stop(vm, 0, __func__);
461			else
462				vm_remove(vm, __func__);
463		} else {
464			/* Stop VM instance but keep the tty open */
465			vm_stop(vm, 1, __func__);
466			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
467		}
468
469		/* Send a response if a control client is waiting for it */
470		if (imsg->hdr.peerid != (uint32_t)-1) {
471			/* the error is meaningless for deferred responses */
472			vmr.vmr_result = 0;
473
474			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
475			    IMSG_VMDOP_TERMINATE_VM_RESPONSE,
476			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
477				return (-1);
478		}
479		break;
480	case IMSG_VMDOP_GET_INFO_VM_DATA:
481		IMSG_SIZE_CHECK(imsg, &vir);
482		memcpy(&vir, imsg->data, sizeof(vir));
483		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
484			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
485			if (vm->vm_ttyname != NULL)
486				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
487				    sizeof(vir.vir_ttyname));
488			if (vm->vm_shutdown) {
489				/* XXX there might be a nicer way */
490				(void)strlcat(vir.vir_info.vir_name,
491				    " - stopping",
492				    sizeof(vir.vir_info.vir_name));
493			}
494			/* get the user id who started the vm */
495			vir.vir_uid = vm->vm_uid;
496			vir.vir_gid = vm->vm_params.vmc_owner.gid;
497		}
498		if (proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type,
499		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
500			log_debug("%s: GET_INFO_VM failed for vm %d, removing",
501			    __func__, vm->vm_vmid);
502			vm_remove(vm, __func__);
503			return (-1);
504		}
505		break;
506	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
507		/*
508		 * PROC_VMM has responded with the *running* VMs, now we
509		 * append the others. These use the special value 0 for their
510		 * kernel id to indicate that they are not running.
511		 */
512		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
513			if (!vm->vm_running) {
514				memset(&vir, 0, sizeof(vir));
515				vir.vir_info.vir_id = vm->vm_vmid;
516				strlcpy(vir.vir_info.vir_name,
517				    vm->vm_params.vmc_params.vcp_name,
518				    VMM_MAX_NAME_LEN);
519				vir.vir_info.vir_memory_size =
520				    vm->vm_params.vmc_params.
521				    vcp_memranges[0].vmr_size;
522				vir.vir_info.vir_ncpus =
523				    vm->vm_params.vmc_params.vcp_ncpus;
524				/* get the configured user id for this vm */
525				vir.vir_uid = vm->vm_params.vmc_owner.uid;
526				vir.vir_gid = vm->vm_params.vmc_owner.gid;
527				if (proc_compose_imsg(ps, PROC_CONTROL, -1,
528				    IMSG_VMDOP_GET_INFO_VM_DATA,
529				    imsg->hdr.peerid, -1, &vir,
530				    sizeof(vir)) == -1) {
531					log_debug("%s: GET_INFO_VM_END failed",
532					    __func__);
533					vm_remove(vm, __func__);
534					return (-1);
535				}
536			}
537		}
538		IMSG_SIZE_CHECK(imsg, &res);
539		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
540		break;
541	default:
542		return (-1);
543	}
544
545	return (0);
546}
547
548int
549vmd_check_vmh(struct vm_dump_header *vmh)
550{
551	int i;
552	unsigned int code, leaf;
553	unsigned int a, b, c, d;
554
555
556	if (vmh->vmh_version != VM_DUMP_VERSION) {
557		log_warnx("%s: incompatible dump version", __func__);
558		return (-1);
559	}
560
561	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
562		code = vmh->vmh_cpuids[i].code;
563		leaf = vmh->vmh_cpuids[i].leaf;
564		if (leaf != 0x00) {
565			log_debug("%s: invalid leaf 0x%x for code 0x%x",
566			    __func__, leaf, code);
567			return (-1);
568		}
569
570		switch (code) {
571		case 0x00:
572			CPUID_LEAF(code, leaf, a, b, c, d);
573			if (vmh->vmh_cpuids[i].a > a) {
574				log_debug("%s: incompatible cpuid level",
575				    __func__);
576				return (-1);
577			}
578			if (!(vmh->vmh_cpuids[i].b == b &&
579			    vmh->vmh_cpuids[i].c == c &&
580			    vmh->vmh_cpuids[i].d == d)) {
581				log_debug("%s: incompatible cpu brand",
582				    __func__);
583				return (-1);
584			}
585			break;
586
587		case 0x01:
588			CPUID_LEAF(code, leaf, a, b, c, d);
589			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
590			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
591				log_debug("%s: incompatible cpu features "
592				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
593				    code, leaf);
594				return (-1);
595			}
596			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
597			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
598				log_debug("%s: incompatible cpu features "
599				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
600				    code, leaf);
601				return (-1);
602			}
603			break;
604
605		case 0x07:
606			CPUID_LEAF(code, leaf, a, b, c, d);
607			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
608			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
609				log_debug("%s: incompatible cpu features "
610				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
611				    code, leaf);
612				return (-1);
613			}
614			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
615			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
616				log_debug("%s: incompatible cpu features "
617				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
618				    code, leaf);
619				return (-1);
620			}
621			break;
622
623		case 0x0d:
624			CPUID_LEAF(code, leaf, a, b, c, d);
625			if (vmh->vmh_cpuids[i].b > b) {
626				log_debug("%s: incompatible cpu: insufficient "
627				    "max save area for enabled XCR0 features",
628				    __func__);
629				return (-1);
630			}
631			if (vmh->vmh_cpuids[i].c > c) {
632				log_debug("%s: incompatible cpu: insufficient "
633				    "max save area for supported XCR0 features",
634				    __func__);
635				return (-1);
636			}
637			break;
638
639		case 0x80000001:
640			CPUID_LEAF(code, leaf, a, b, c, d);
641			if ((vmh->vmh_cpuids[i].a & a) !=
642			    vmh->vmh_cpuids[i].a) {
643				log_debug("%s: incompatible cpu features "
644				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
645				    code, leaf);
646				return (-1);
647			}
648			if ((vmh->vmh_cpuids[i].c & c) !=
649			    vmh->vmh_cpuids[i].c) {
650				log_debug("%s: incompatible cpu features "
651				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
652				    code, leaf);
653				return (-1);
654			}
655			if ((vmh->vmh_cpuids[i].d & d) !=
656			    vmh->vmh_cpuids[i].d) {
657				log_debug("%s: incompatible cpu features "
658				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
659				    code, leaf);
660				return (-1);
661			}
662			break;
663
664		default:
665			log_debug("%s: unknown code 0x%x", __func__, code);
666			return (-1);
667		}
668	}
669
670	return (0);
671}
672
673void
674vmd_sighdlr(int sig, short event, void *arg)
675{
676	if (privsep_process != PROC_PARENT)
677		return;
678	log_debug("%s: handling signal", __func__);
679
680	switch (sig) {
681	case SIGHUP:
682		log_info("%s: reload requested with SIGHUP", __func__);
683
684		/*
685		 * This is safe because libevent uses async signal handlers
686		 * that run in the event loop and not in signal context.
687		 */
688		(void)vmd_reload(0, NULL);
689		break;
690	case SIGPIPE:
691		log_info("%s: ignoring SIGPIPE", __func__);
692		break;
693	case SIGUSR1:
694		log_info("%s: ignoring SIGUSR1", __func__);
695		break;
696	case SIGTERM:
697	case SIGINT:
698		vmd_shutdown();
699		break;
700	default:
701		fatalx("unexpected signal");
702	}
703}
704
705__dead void
706usage(void)
707{
708	extern char *__progname;
709	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
710	    __progname);
711	exit(1);
712}
713
/*
 * Entry point for the parent and (via proc_init) all privsep child
 * processes.  Parses options, forks the children, installs signal
 * handlers and runs the libevent loop.  Only the parent returns here.
 */
int
main(int argc, char **argv)
{
	struct privsep		*ps;
	int			 ch;
	const char		*conffile = VMD_CONF;
	enum privsep_procid	 proc_id = PROC_PARENT;
	int			 proc_instance = 0;
	const char		*errp, *title = NULL;
	int			 argc0 = argc;	/* argc before getopt, for proc_init() */

	/* Log to syslog until the debug level is known. */
	log_init(0, LOG_DAEMON);

	if ((env = calloc(1, sizeof(*env))) == NULL)
		fatal("calloc: env");

	while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) {
		switch (ch) {
		case 'D':	/* define a config macro: -D macro=value */
			if (cmdline_symset(optarg) < 0)
				log_warnx("could not parse macro definition %s",
				    optarg);
			break;
		case 'd':	/* debug mode */
			env->vmd_debug = 2;
			break;
		case 'f':	/* alternative configuration file */
			conffile = optarg;
			break;
		case 'v':	/* increase verbosity (may be repeated) */
			env->vmd_verbose++;
			break;
		case 'n':	/* configtest: parse config and exit */
			env->vmd_noaction = 1;
			break;
		case 'P':	/* run as the named privsep child process */
			title = optarg;
			proc_id = proc_getid(procs, nitems(procs), title);
			if (proc_id == PROC_MAX)
				fatalx("invalid process name");
			break;
		case 'I':	/* privsep child instance number */
			proc_instance = strtonum(optarg, 0,
			    PROC_MAX_INSTANCES, &errp);
			if (errp)
				fatalx("invalid process instance");
			break;
		default:
			usage();
		}
	}

	argc -= optind;
	if (argc > 0)
		usage();

	/* -n implies at least debug level 1 so messages reach the console */
	if (env->vmd_noaction && !env->vmd_debug)
		env->vmd_debug = 1;

	/* check for root privileges */
	if (env->vmd_noaction == 0) {
		if (geteuid())
			fatalx("need root privileges");
	}

	ps = &env->vmd_ps;
	ps->ps_env = env;
	env->vmd_fd = -1;

	if (config_init(env) == -1)
		fatal("failed to initialize configuration");

	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
		fatal("unknown user %s", VMD_USER);

	/* First proc runs as root without pledge but in default chroot */
	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */

	/* Open /dev/vmm */
	if (env->vmd_noaction == 0) {
		env->vmd_fd = open(VMM_NODE, O_RDWR);
		if (env->vmd_fd == -1)
			fatal("%s", VMM_NODE);
	}

	/* Configure the control socket */
	ps->ps_csock.cs_name = SOCKET_NAME;
	TAILQ_INIT(&ps->ps_rcsocks);

	/* Configuration will be parsed after forking the children */
	env->vmd_conffile = conffile;

	/* Re-initialize logging now that debug/verbose levels are known. */
	log_init(env->vmd_debug, LOG_DAEMON);
	log_setverbose(env->vmd_verbose);

	if (env->vmd_noaction)
		ps->ps_noaction = 1;
	ps->ps_instance = proc_instance;
	if (title != NULL)
		ps->ps_title[proc_id] = title;

	/* only the parent returns */
	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
	    proc_id);

	log_procinit("parent");
	if (!env->vmd_debug && daemon(0, 0) == -1)
		fatal("can't daemonize");

	if (ps->ps_noaction == 0)
		log_info("startup");

	/* Parent event loop: install signal handlers before dispatch. */
	event_init();

	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);

	signal_add(&ps->ps_evsigint, NULL);
	signal_add(&ps->ps_evsigterm, NULL);
	signal_add(&ps->ps_evsighup, NULL);
	signal_add(&ps->ps_evsigpipe, NULL);
	signal_add(&ps->ps_evsigusr1, NULL);

	if (!env->vmd_noaction)
		proc_connect(ps);

	if (vmd_configure() == -1)
		fatalx("configuration failed");

	event_dispatch();

	log_debug("parent exiting");

	return (0);
}
853
/*
 * Post-fork configuration of the parent: pledge, parse the config
 * file, push the global config to the children and create the
 * configured switches and VMs.  Returns 0 on success, -1 on error;
 * exits directly on parse failure or in configtest (-n) mode.
 */
int
vmd_configure(void)
{
	struct vmd_vm		*vm;
	struct vmd_switch	*vsw;

	/* pty master fd, needed later for openpty-style tty allocation */
	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
		fatal("open %s", PATH_PTMDEV);

	/*
	 * pledge in the parent process:
	 * stdio - for malloc and basic I/O including events.
	 * rpath - for reload to open and read the configuration files.
	 * wpath - for opening disk images and tap devices.
	 * tty - for openpty and TIOCUCNTL.
	 * proc - run kill to terminate its children safely.
	 * sendfd - for disks, interfaces and other fds.
	 * recvfd - for send and receive.
	 * getpw - lookup user or group id by name.
	 * chown, fattr - change tty ownership
	 * flock - locking disk files
	 */
	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
	    " chown fattr flock", NULL) == -1)
		fatal("pledge");

	/* parse errors are fatal: kill the children and exit */
	if (parse_config(env->vmd_conffile) == -1) {
		proc_kill(&env->vmd_ps);
		exit(1);
	}

	if (env->vmd_noaction) {
		fprintf(stderr, "configuration OK\n");
		proc_kill(&env->vmd_ps);
		exit(0);
	}

	/* Send shared global configuration to all children */
	if (config_setconfig(env) == -1)
		return (-1);

	/* create any configured switches that are not up yet */
	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
		if (vsw->sw_running)
			continue;
		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
			log_warn("%s: failed to create switch %s",
			    __func__, vsw->sw_name);
			switch_remove(vsw);
			return (-1);
		}
	}

	/* start all configured VMs that are not marked disabled */
	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
		if (vm->vm_disabled) {
			log_debug("%s: not creating vm %s (disabled)",
			    __func__,
			    vm->vm_params.vmc_params.vcp_name);
			continue;
		}
		if (config_setvm(&env->vmd_ps, vm,
		    -1, vm->vm_params.vmc_owner.uid) == -1)
			return (-1);
	}

	return (0);
}
920
/*
 * Load, reload or reset the configuration.
 *
 * reset != 0: purge/reset the running state to the given level and
 * return.  reset == 0: parse "filename" (or the default config file
 * when filename is NULL/empty, which makes this a "reload" that first
 * drops all non-running VMs) and start any new switches and VMs.
 * Returns 0 on success, -1 on error.
 */
int
vmd_reload(unsigned int reset, const char *filename)
{
	struct vmd_vm		*vm, *next_vm;
	struct vmd_switch	*vsw;
	int			 reload = 0;

	/* Switch back to the default config file */
	if (filename == NULL || *filename == '\0') {
		filename = env->vmd_conffile;
		reload = 1;
	}

	log_debug("%s: level %d config file %s", __func__, reset, filename);

	if (reset) {
		/* Purge the configuration */
		config_purge(env, reset);
		config_setreset(env, reset);
	} else {
		/*
		 * Load or reload the configuration.
		 *
		 * Reloading removes all non-running VMs before processing the
		 * config file, whereas loading only adds to the existing list
		 * of VMs.
		 */

		if (reload) {
			/* safe variant: vm_remove() frees the list entry */
			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
			    next_vm) {
				if (vm->vm_running == 0) {
					DPRINTF("%s: calling vm_remove",
					    __func__);
					vm_remove(vm, __func__);
				}
			}
		}

		if (parse_config(filename) == -1) {
			log_debug("%s: failed to load config file %s",
			    __func__, filename);
			return (-1);
		}

		if (reload) {
			/* Update shared global configuration in all children */
			if (config_setconfig(env) == -1)
				return (-1);
		}

		/* bring up switches that are not running yet */
		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
			if (vsw->sw_running)
				continue;
			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
				log_warn("%s: failed to create switch %s",
				    __func__, vsw->sw_name);
				switch_remove(vsw);
				return (-1);
			}
		}

		/* start new VMs; running ones are left untouched */
		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
			if (vm->vm_running == 0) {
				if (vm->vm_disabled) {
					log_debug("%s: not creating vm %s"
					    " (disabled)", __func__,
					    vm->vm_params.vmc_params.vcp_name);
					continue;
				}
				if (config_setvm(&env->vmd_ps, vm,
				    -1, vm->vm_params.vmc_owner.uid) == -1)
					return (-1);
			} else {
				log_debug("%s: not creating vm \"%s\": "
				    "(running)", __func__,
				    vm->vm_params.vmc_params.vcp_name);
			}
		}
	}

	return (0);
}
1004
1005void
1006vmd_shutdown(void)
1007{
1008	struct vmd_vm *vm, *vm_next;
1009
1010	log_debug("%s: performing shutdown", __func__);
1011
1012	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
1013		vm_remove(vm, __func__);
1014	}
1015
1016	proc_kill(&env->vmd_ps);
1017	free(env);
1018
1019	log_warnx("parent terminating");
1020	exit(0);
1021}
1022
1023struct vmd_vm *
1024vm_getbyvmid(uint32_t vmid)
1025{
1026	struct vmd_vm	*vm;
1027
1028	if (vmid == 0)
1029		return (NULL);
1030	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1031		if (vm->vm_vmid == vmid)
1032			return (vm);
1033	}
1034
1035	return (NULL);
1036}
1037
1038struct vmd_vm *
1039vm_getbyid(uint32_t id)
1040{
1041	struct vmd_vm	*vm;
1042
1043	if (id == 0)
1044		return (NULL);
1045	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1046		if (vm->vm_params.vmc_params.vcp_id == id)
1047			return (vm);
1048	}
1049
1050	return (NULL);
1051}
1052
1053uint32_t
1054vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1055{
1056	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1057		return (0);
1058	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1059	    id, vm->vm_vmid);
1060	return (vm->vm_vmid);
1061}
1062
1063uint32_t
1064vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1065{
1066	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1067		return (0);
1068	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1069	    vmid, vm->vm_params.vmc_params.vcp_id);
1070	return (vm->vm_params.vmc_params.vcp_id);
1071}
1072
1073struct vmd_vm *
1074vm_getbyname(const char *name)
1075{
1076	struct vmd_vm	*vm;
1077
1078	if (name == NULL)
1079		return (NULL);
1080	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1081		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1082			return (vm);
1083	}
1084
1085	return (NULL);
1086}
1087
1088struct vmd_vm *
1089vm_getbypid(pid_t pid)
1090{
1091	struct vmd_vm	*vm;
1092
1093	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1094		if (vm->vm_pid == pid)
1095			return (vm);
1096	}
1097
1098	return (NULL);
1099}
1100
/*
 * vm_stop
 *
 * Tear down the runtime state of a vm: release the user accounting,
 * close the ipc channel to the vm process, close all disk, network,
 * kernel and cdrom descriptors, and optionally release the console
 * tty.  The vm itself stays registered (see vm_remove()).
 *
 * Parameters:
 *  vm: the vm to stop (NULL is a no-op)
 *  keeptty: when non-zero, keep the console tty open across the stop
 *  caller: name of the calling function, used in the debug log only
 */
void
vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 i, j;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s stopping vm %d%s",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");

	/* The vm is no longer running or in shutdown. */
	vm->vm_running = 0;
	vm->vm_shutdown = 0;

	/* Decrement the user's resource counters and drop a reference. */
	user_inc(&vm->vm_params.vmc_params, vm->vm_user, 0);
	user_put(vm->vm_user);

	/* Close the ipc channel to the vm process, if one was open. */
	if (vm->vm_iev.ibuf.fd != -1) {
		event_del(&vm->vm_iev.ev);
		close(vm->vm_iev.ibuf.fd);
	}
	/* Close every disk image, including its base images. */
	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
			if (vm->vm_disks[i][j] != -1) {
				close(vm->vm_disks[i][j]);
				vm->vm_disks[i][j] = -1;
			}
		}
	}
	/* Close the interface fds and free their configuration strings. */
	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
		if (vm->vm_ifs[i].vif_fd != -1) {
			close(vm->vm_ifs[i].vif_fd);
			vm->vm_ifs[i].vif_fd = -1;
		}
		free(vm->vm_ifs[i].vif_name);
		free(vm->vm_ifs[i].vif_switch);
		free(vm->vm_ifs[i].vif_group);
		vm->vm_ifs[i].vif_name = NULL;
		vm->vm_ifs[i].vif_switch = NULL;
		vm->vm_ifs[i].vif_group = NULL;
	}
	if (vm->vm_kernel != -1) {
		close(vm->vm_kernel);
		vm->vm_kernel = -1;
	}
	if (vm->vm_cdrom != -1) {
		close(vm->vm_cdrom);
		vm->vm_cdrom = -1;
	}
	/* Release the console tty unless the caller wants to keep it. */
	if (!keeptty) {
		vm_closetty(vm);
		vm->vm_uid = 0;
	}
}
1157
/*
 * vm_remove
 *
 * Remove a vm from the running configuration: unlink it from the list
 * of registered vms, stop it and free its memory.
 *
 * Parameters:
 *  vm: the vm to remove (NULL is a no-op)
 *  caller: name of the calling function, used in the debug log only
 *
 * NOTE(review): user_put() is called here and again inside vm_stop(),
 * so the user refcount drops twice per removal — presumably matching
 * two earlier user_get() references; verify against the callers.
 */
void
vm_remove(struct vmd_vm *vm, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s removing vm %d from running config",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid);

	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);

	user_put(vm->vm_user);
	vm_stop(vm, 0, caller);
	free(vm);
}
1176
1177uint32_t
1178vm_claimid(const char *name, int uid)
1179{
1180	struct name2id *n2i = NULL;
1181
1182	TAILQ_FOREACH(n2i, env->vmd_known, entry)
1183		if (strcmp(n2i->name, name) == 0 && n2i->uid == uid)
1184			return n2i->id;
1185
1186	if (++env->vmd_nvm == 0)
1187		fatalx("too many vms");
1188	if ((n2i = calloc(1, sizeof(struct name2id))) == NULL)
1189		fatalx("could not alloc vm name");
1190	n2i->id = env->vmd_nvm;
1191	n2i->uid = uid;
1192	if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name))
1193		fatalx("overlong vm name");
1194	TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry);
1195
1196	return n2i->id;
1197}
1198
1199int
1200vm_register(struct privsep *ps, struct vmop_create_params *vmc,
1201    struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
1202{
1203	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
1204	struct vm_create_params	*vcp = &vmc->vmc_params;
1205	struct vmop_owner	*vmo = NULL;
1206	struct vmd_user		*usr = NULL;
1207	uint32_t		 rng;
1208	unsigned int		 i, j;
1209	struct vmd_switch	*sw;
1210	char			*s;
1211
1212	/* Check if this is an instance of another VM */
1213	if (vm_instance(ps, &vm_parent, vmc, uid) == -1)
1214		return (-1);
1215
1216	errno = 0;
1217	*ret_vm = NULL;
1218
1219	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1220	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1221		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
1222		    uid) != 0) {
1223			errno = EPERM;
1224			goto fail;
1225		}
1226		*ret_vm = vm;
1227		errno = EALREADY;
1228		goto fail;
1229	}
1230
1231	if (vm_parent != NULL)
1232		vmo = &vm_parent->vm_params.vmc_insowner;
1233
1234	/* non-root users can only start existing VMs or instances */
1235	if (vm_checkperm(NULL, vmo, uid) != 0) {
1236		log_warnx("permission denied");
1237		errno = EPERM;
1238		goto fail;
1239	}
1240	if (vmc->vmc_flags == 0) {
1241		log_warnx("invalid configuration, no devices");
1242		errno = VMD_DISK_MISSING;
1243		goto fail;
1244	}
1245	if (vcp->vcp_ncpus == 0)
1246		vcp->vcp_ncpus = 1;
1247	if (vcp->vcp_memranges[0].vmr_size == 0)
1248		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
1249	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
1250		log_warnx("invalid number of CPUs");
1251		goto fail;
1252	} else if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) {
1253		log_warnx("invalid number of disks");
1254		goto fail;
1255	} else if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) {
1256		log_warnx("invalid number of interfaces");
1257		goto fail;
1258	} else if (strlen(vcp->vcp_kernel) == 0 &&
1259	    vcp->vcp_ndisks == 0 && strlen(vcp->vcp_cdrom) == 0) {
1260		log_warnx("no kernel or disk/cdrom specified");
1261		goto fail;
1262	} else if (strlen(vcp->vcp_name) == 0) {
1263		log_warnx("invalid VM name");
1264		goto fail;
1265	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
1266	    *vcp->vcp_name == '_') {
1267		log_warnx("invalid VM name");
1268		goto fail;
1269	} else {
1270		for (s = vcp->vcp_name; *s != '\0'; ++s) {
1271			if (!(isalnum(*s) || *s == '.' || *s == '-' ||
1272			    *s == '_')) {
1273				log_warnx("invalid VM name");
1274				goto fail;
1275			}
1276		}
1277	}
1278
1279	/* track active users */
1280	if (uid != 0 && env->vmd_users != NULL &&
1281	    (usr = user_get(uid)) == NULL) {
1282		log_warnx("could not add user");
1283		goto fail;
1284	}
1285
1286	if ((vm = calloc(1, sizeof(*vm))) == NULL)
1287		goto fail;
1288
1289	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
1290	vmc = &vm->vm_params;
1291	vcp = &vmc->vmc_params;
1292	vm->vm_pid = -1;
1293	vm->vm_tty = -1;
1294	vm->vm_receive_fd = -1;
1295	vm->vm_paused = 0;
1296	vm->vm_user = usr;
1297
1298	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
1299		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
1300			vm->vm_disks[i][j] = -1;
1301	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
1302		vm->vm_ifs[i].vif_fd = -1;
1303	for (i = 0; i < vcp->vcp_nnics; i++) {
1304		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
1305			/* inherit per-interface flags from the switch */
1306			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
1307		}
1308
1309		/*
1310		 * If the MAC address is zero, always randomize it in vmd(8)
1311		 * because we cannot rely on the guest OS to do the right
1312		 * thing like OpenBSD does.  Based on ether_fakeaddr()
1313		 * from the kernel, incremented by one to differentiate
1314		 * the source.
1315		 */
1316		if (memcmp(zero_mac, &vcp->vcp_macs[i], ETHER_ADDR_LEN) == 0) {
1317			rng = arc4random();
1318			vcp->vcp_macs[i][0] = 0xfe;
1319			vcp->vcp_macs[i][1] = 0xe1;
1320			vcp->vcp_macs[i][2] = 0xba + 1;
1321			vcp->vcp_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
1322			vcp->vcp_macs[i][4] = rng;
1323			vcp->vcp_macs[i][5] = rng >> 8;
1324		}
1325	}
1326	vm->vm_kernel = -1;
1327	vm->vm_cdrom = -1;
1328	vm->vm_iev.ibuf.fd = -1;
1329
1330	/* Assign a new internal Id if not specified */
1331	vm->vm_vmid = (id == 0) ? vm_claimid(vcp->vcp_name, uid) : id;
1332
1333	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
1334	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);
1335
1336	*ret_vm = vm;
1337	return (0);
1338 fail:
1339	if (errno == 0)
1340		errno = EINVAL;
1341	return (-1);
1342}
1343
1344int
1345vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1346    struct vmop_create_params *vmc, uid_t uid)
1347{
1348	char			*name;
1349	struct vm_create_params	*vcp = &vmc->vmc_params;
1350	struct vmop_create_params *vmcp;
1351	struct vm_create_params	*vcpp;
1352	struct vmd_vm		*vm = NULL;
1353	unsigned int		 i, j;
1354	uint32_t		 id;
1355
1356	/* return without error if the parent is NULL (nothing to inherit) */
1357	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1358	    (*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL)
1359		return (0);
1360
1361	errno = 0;
1362	vmcp = &(*vm_parent)->vm_params;
1363	vcpp = &vmcp->vmc_params;
1364
1365	/* Are we allowed to create an instance from this VM? */
1366	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1367		log_warnx("vm \"%s\" no permission to create vm instance",
1368		    vcpp->vcp_name);
1369		errno = ENAMETOOLONG;
1370		return (-1);
1371	}
1372
1373	id = vcp->vcp_id;
1374	name = vcp->vcp_name;
1375
1376	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1377	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1378		errno = EPROCLIM;
1379		return (-1);
1380	}
1381
1382	/* CPU */
1383	if (vcp->vcp_ncpus == 0)
1384		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1385	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1386	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1387		log_warnx("vm \"%s\" no permission to set cpus", name);
1388		errno = EPERM;
1389		return (-1);
1390	}
1391
1392	/* memory */
1393	if (vcp->vcp_memranges[0].vmr_size == 0)
1394		vcp->vcp_memranges[0].vmr_size =
1395		    vcpp->vcp_memranges[0].vmr_size;
1396	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1397	    vcp->vcp_memranges[0].vmr_size !=
1398	    vcpp->vcp_memranges[0].vmr_size) {
1399		log_warnx("vm \"%s\" no permission to set memory", name);
1400		errno = EPERM;
1401		return (-1);
1402	}
1403
1404	/* disks cannot be inherited */
1405	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1406	    vcp->vcp_ndisks) {
1407		log_warnx("vm \"%s\" no permission to set disks", name);
1408		errno = EPERM;
1409		return (-1);
1410	}
1411	for (i = 0; i < vcp->vcp_ndisks; i++) {
1412		/* Check if this disk is already used in the parent */
1413		for (j = 0; j < vcpp->vcp_ndisks; j++) {
1414			if (strcmp(vcp->vcp_disks[i],
1415			    vcpp->vcp_disks[j]) == 0) {
1416				log_warnx("vm \"%s\" disk %s cannot be reused",
1417				    name, vcp->vcp_disks[i]);
1418				errno = EBUSY;
1419				return (-1);
1420			}
1421		}
1422		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1423	}
1424
1425	/* interfaces */
1426	if (vcp->vcp_nnics > 0 &&
1427	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1428	    vcp->vcp_nnics != vcpp->vcp_nnics) {
1429		log_warnx("vm \"%s\" no permission to set interfaces", name);
1430		errno = EPERM;
1431		return (-1);
1432	}
1433	for (i = 0; i < vcpp->vcp_nnics; i++) {
1434		/* Interface got overwritten */
1435		if (i < vcp->vcp_nnics)
1436			continue;
1437
1438		/* Copy interface from parent */
1439		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1440		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1441		    sizeof(vmc->vmc_ifnames[i]));
1442		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1443		    sizeof(vmc->vmc_ifswitch[i]));
1444		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1445		    sizeof(vmc->vmc_ifgroup[i]));
1446		memcpy(vcp->vcp_macs[i], vcpp->vcp_macs[i],
1447		    sizeof(vcp->vcp_macs[i]));
1448		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1449		vcp->vcp_nnics++;
1450	}
1451	for (i = 0; i < vcp->vcp_nnics; i++) {
1452		for (j = 0; j < vcpp->vcp_nnics; j++) {
1453			if (memcmp(zero_mac, vcp->vcp_macs[i],
1454			    sizeof(vcp->vcp_macs[i])) != 0 &&
1455			    memcmp(vcpp->vcp_macs[i], vcp->vcp_macs[i],
1456			    sizeof(vcp->vcp_macs[i])) != 0) {
1457				log_warnx("vm \"%s\" lladdr cannot be reused",
1458				    name);
1459				errno = EBUSY;
1460				return (-1);
1461			}
1462			if (strlen(vmc->vmc_ifnames[i]) &&
1463			    strcmp(vmc->vmc_ifnames[i],
1464			    vmcp->vmc_ifnames[j]) == 0) {
1465				log_warnx("vm \"%s\" %s cannot be reused",
1466				    vmc->vmc_ifnames[i], name);
1467				errno = EBUSY;
1468				return (-1);
1469			}
1470		}
1471	}
1472
1473	/* kernel */
1474	if (strlen(vcp->vcp_kernel) > 0) {
1475		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1476			log_warnx("vm \"%s\" no permission to set boot image",
1477			    name);
1478			errno = EPERM;
1479			return (-1);
1480		}
1481		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1482	} else if (strlcpy(vcp->vcp_kernel, vcpp->vcp_kernel,
1483	    sizeof(vcp->vcp_kernel)) >= sizeof(vcp->vcp_kernel)) {
1484		log_warnx("vm \"%s\" kernel name too long", name);
1485		errno = EINVAL;
1486		return (-1);
1487	}
1488
1489	/* cdrom */
1490	if (strlen(vcp->vcp_cdrom) > 0) {
1491		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1492			log_warnx("vm \"%s\" no permission to set cdrom", name);
1493			errno = EPERM;
1494			return (-1);
1495		}
1496		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1497	} else if (strlcpy(vcp->vcp_cdrom, vcpp->vcp_cdrom,
1498	    sizeof(vcp->vcp_cdrom)) >= sizeof(vcp->vcp_cdrom)) {
1499		log_warnx("vm \"%s\" cdrom name too long", name);
1500		errno = EINVAL;
1501		return (-1);
1502	}
1503
1504	/* user */
1505	if (vmc->vmc_owner.uid == 0)
1506		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1507	else if (vmc->vmc_owner.uid != uid &&
1508	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1509		log_warnx("vm \"%s\" user mismatch", name);
1510		errno = EPERM;
1511		return (-1);
1512	}
1513
1514	/* group */
1515	if (vmc->vmc_owner.gid == 0)
1516		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1517	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1518		log_warnx("vm \"%s\" group mismatch", name);
1519		errno = EPERM;
1520		return (-1);
1521	}
1522
1523	/* child instances */
1524	if (vmc->vmc_insflags) {
1525		log_warnx("vm \"%s\" cannot change instance permissions", name);
1526		errno = EPERM;
1527		return (-1);
1528	}
1529	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1530		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
1531		vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid;
1532		vmc->vmc_insflags = vmcp->vmc_insflags;
1533	} else {
1534		vmc->vmc_insowner.gid = 0;
1535		vmc->vmc_insowner.uid = 0;
1536		vmc->vmc_insflags = 0;
1537	}
1538
1539	/* finished, remove instance flags */
1540	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1541
1542	return (0);
1543}
1544
1545/*
1546 * vm_checkperm
1547 *
1548 * Checks if the user represented by the 'uid' parameter is allowed to
1549 * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1550 * console.)
1551 *
1552 * Parameters:
1553 *  vm: the VM whose permission is to be checked
1554 *  vmo: the required uid/gid to be checked
1555 *  uid: the user ID of the user making the request
1556 *
1557 * Return values:
1558 *   0: the permission should be granted
1559 *  -1: the permission check failed (also returned if vm == null)
1560 */
1561int
1562vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1563{
1564	struct group	*gr;
1565	struct passwd	*pw;
1566	char		**grmem;
1567
1568	/* root has no restrictions */
1569	if (uid == 0)
1570		return (0);
1571
1572	if (vmo == NULL)
1573		return (-1);
1574
1575	/* check user */
1576	if (vm == NULL) {
1577		if  (vmo->uid == uid)
1578			return (0);
1579	} else {
1580		/*
1581		 * check user of running vm (the owner of a running vm can
1582		 * be different to (or more specific than) the configured owner.
1583		 */
1584		if ((vm->vm_running && vm->vm_uid == uid) ||
1585		    (!vm->vm_running && vmo->uid == uid))
1586			return (0);
1587	}
1588
1589	/* check groups */
1590	if (vmo->gid != -1) {
1591		if ((pw = getpwuid(uid)) == NULL)
1592			return (-1);
1593		if (pw->pw_gid == vmo->gid)
1594			return (0);
1595		if ((gr = getgrgid(vmo->gid)) != NULL) {
1596			for (grmem = gr->gr_mem; *grmem; grmem++)
1597				if (strcmp(*grmem, pw->pw_name) == 0)
1598					return (0);
1599		}
1600	}
1601
1602	return (-1);
1603}
1604
1605/*
1606 * vm_checkinsflag
1607 *
1608 * Checks wheter the non-root user is allowed to set an instance option.
1609 *
1610 * Parameters:
1611 *  vmc: the VM create parameters
1612 *  flag: the flag to be checked
1613 *  uid: the user ID of the user making the request
1614 *
1615 * Return values:
1616 *   0: the permission should be granted
1617 *  -1: the permission check failed (also returned if vm == null)
1618 */
1619int
1620vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1621{
1622	/* root has no restrictions */
1623	if (uid == 0)
1624		return (0);
1625
1626	if ((vmc->vmc_insflags & flag) == 0)
1627		return (-1);
1628
1629	return (0);
1630}
1631
1632/*
1633 * vm_checkaccess
1634 *
1635 * Checks if the user represented by the 'uid' parameter is allowed to
1636 * access the file described by the 'path' parameter.
1637 *
1638 * Parameters:
1639 *  fd: the file descriptor of the opened file
1640 *  uflag: check if the userid has access to the file
1641 *  uid: the user ID of the user making the request
1642 *  amode: the access flags of R_OK and W_OK
1643 *
1644 * Return values:
1645 *   0: the permission should be granted
1646 *  -1: the permission check failed
1647 */
1648int
1649vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
1650{
1651	struct group	*gr;
1652	struct passwd	*pw;
1653	char		**grmem;
1654	struct stat	 st;
1655	mode_t		 mode;
1656
1657	if (fd == -1)
1658		return (-1);
1659
1660	/*
1661	 * File has to be accessible and a regular file
1662	 */
1663	if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode))
1664		return (-1);
1665
1666	/* root has no restrictions */
1667	if (uid == 0 || uflag == 0)
1668		return (0);
1669
1670	/* check other */
1671	mode = amode & W_OK ? S_IWOTH : 0;
1672	mode |= amode & R_OK ? S_IROTH : 0;
1673	if ((st.st_mode & mode) == mode)
1674		return (0);
1675
1676	/* check user */
1677	mode = amode & W_OK ? S_IWUSR : 0;
1678	mode |= amode & R_OK ? S_IRUSR : 0;
1679	if (uid == st.st_uid && (st.st_mode & mode) == mode)
1680		return (0);
1681
1682	/* check groups */
1683	mode = amode & W_OK ? S_IWGRP : 0;
1684	mode |= amode & R_OK ? S_IRGRP : 0;
1685	if ((st.st_mode & mode) != mode)
1686		return (-1);
1687	if ((pw = getpwuid(uid)) == NULL)
1688		return (-1);
1689	if (pw->pw_gid == st.st_gid)
1690		return (0);
1691	if ((gr = getgrgid(st.st_gid)) != NULL) {
1692		for (grmem = gr->gr_mem; *grmem; grmem++)
1693			if (strcmp(*grmem, pw->pw_name) == 0)
1694				return (0);
1695	}
1696
1697	return (-1);
1698}
1699
/*
 * vm_opentty
 *
 * Allocate a pty pair for the vm's console via the pre-opened PTM
 * device, enable user ioctl mode on the controlling side, and adjust
 * ownership and mode of the tty device node.
 *
 * Parameters:
 *  vm: the vm that gets the console; on success vm_tty holds the open
 *      controlling fd and vm_ttyname the tty device path
 *
 * Return values:
 *   0: success
 *  -1: failure; any partially set up tty state is released again
 */
int
vm_opentty(struct vmd_vm *vm)
{
	struct ptmget		 ptm;
	struct stat		 st;
	struct group		*gr;
	uid_t			 uid;
	gid_t			 gid;
	mode_t			 mode;
	int			 on;

	/*
	 * Open tty with pre-opened PTM fd
	 */
	if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1))
		return (-1);

	/*
	 * We use user ioctl(2) mode to pass break commands.
	 */
	on = 1;
	if (ioctl(ptm.cfd, TIOCUCNTL, &on))
		fatal("could not enable user ioctl mode");

	/* Keep the controlling side; the slave side is not needed here. */
	vm->vm_tty = ptm.cfd;
	close(ptm.sfd);
	if ((vm->vm_ttyname = strdup(ptm.sn)) == NULL)
		goto fail;

	uid = vm->vm_uid;
	gid = vm->vm_params.vmc_owner.gid;

	/* Pick gid and mode: owner group, "tty" group, or owner-only. */
	if (vm->vm_params.vmc_owner.gid != -1) {
		mode = 0660;
	} else if ((gr = getgrnam("tty")) != NULL) {
		gid = gr->gr_gid;
		mode = 0620;
	} else {
		mode = 0600;
		gid = 0;
	}

	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
	    __func__, vm->vm_params.vmc_params.vcp_name,
	    vm->vm_ttyname, uid, gid, mode);

	/*
	 * Change ownership and mode of the tty as required.
	 * Loosely based on the implementation of sshpty.c
	 */
	if (stat(vm->vm_ttyname, &st) == -1)
		goto fail;

	if (st.st_uid != uid || st.st_gid != gid) {
		if (chown(vm->vm_ttyname, uid, gid) == -1) {
			log_warn("chown %s %d %d failed, uid %d",
			    vm->vm_ttyname, uid, gid, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
		if (chmod(vm->vm_ttyname, mode) == -1) {
			log_warn("chmod %s %o failed, uid %d",
			    vm->vm_ttyname, mode, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	return (0);
 fail:
	vm_closetty(vm);
	return (-1);
}
1782
1783void
1784vm_closetty(struct vmd_vm *vm)
1785{
1786	if (vm->vm_tty != -1) {
1787		/* Release and close the tty */
1788		if (fchown(vm->vm_tty, 0, 0) == -1)
1789			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1790		if (fchmod(vm->vm_tty, 0666) == -1)
1791			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1792		close(vm->vm_tty);
1793		vm->vm_tty = -1;
1794	}
1795	free(vm->vm_ttyname);
1796	vm->vm_ttyname = NULL;
1797}
1798
1799void
1800switch_remove(struct vmd_switch *vsw)
1801{
1802	if (vsw == NULL)
1803		return;
1804
1805	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1806
1807	free(vsw->sw_group);
1808	free(vsw->sw_name);
1809	free(vsw);
1810}
1811
1812struct vmd_switch *
1813switch_getbyname(const char *name)
1814{
1815	struct vmd_switch	*vsw;
1816
1817	if (name == NULL)
1818		return (NULL);
1819	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1820		if (strcmp(vsw->sw_name, name) == 0)
1821			return (vsw);
1822	}
1823
1824	return (NULL);
1825}
1826
1827struct vmd_user *
1828user_get(uid_t uid)
1829{
1830	struct vmd_user		*usr;
1831
1832	if (uid == 0)
1833		return (NULL);
1834
1835	/* first try to find an existing user */
1836	TAILQ_FOREACH(usr, env->vmd_users, usr_entry) {
1837		if (usr->usr_id.uid == uid)
1838			goto done;
1839	}
1840
1841	if ((usr = calloc(1, sizeof(*usr))) == NULL) {
1842		log_warn("could not allocate user");
1843		return (NULL);
1844	}
1845
1846	usr->usr_id.uid = uid;
1847	usr->usr_id.gid = -1;
1848	TAILQ_INSERT_TAIL(env->vmd_users, usr, usr_entry);
1849
1850 done:
1851	DPRINTF("%s: uid %d #%d +",
1852	    __func__, usr->usr_id.uid, usr->usr_refcnt + 1);
1853	usr->usr_refcnt++;
1854
1855	return (usr);
1856}
1857
1858void
1859user_put(struct vmd_user *usr)
1860{
1861	if (usr == NULL)
1862		return;
1863
1864	DPRINTF("%s: uid %d #%d -",
1865	    __func__, usr->usr_id.uid, usr->usr_refcnt - 1);
1866
1867	if (--usr->usr_refcnt > 0)
1868		return;
1869
1870	TAILQ_REMOVE(env->vmd_users, usr, usr_entry);
1871	free(usr);
1872}
1873
1874void
1875user_inc(struct vm_create_params *vcp, struct vmd_user *usr, int inc)
1876{
1877	char	 mem[FMT_SCALED_STRSIZE];
1878
1879	if (usr == NULL)
1880		return;
1881
1882	/* increment or decrement counters */
1883	inc = inc ? 1 : -1;
1884
1885	usr->usr_maxcpu += vcp->vcp_ncpus * inc;
1886	usr->usr_maxmem += vcp->vcp_memranges[0].vmr_size * inc;
1887	usr->usr_maxifs += vcp->vcp_nnics * inc;
1888
1889	if (log_getverbose() > 1) {
1890		(void)fmt_scaled(usr->usr_maxmem * 1024 * 1024, mem);
1891		log_debug("%s: %c uid %d ref %d cpu %llu mem %s ifs %llu",
1892		    __func__, inc == 1 ? '+' : '-',
1893		    usr->usr_id.uid, usr->usr_refcnt,
1894		    usr->usr_maxcpu, mem, usr->usr_maxifs);
1895	}
1896}
1897
1898int
1899user_checklimit(struct vmd_user *usr, struct vm_create_params *vcp)
1900{
1901	const char	*limit = "";
1902
1903	/* XXX make the limits configurable */
1904	if (usr->usr_maxcpu > VM_DEFAULT_USER_MAXCPU) {
1905		limit = "cpu ";
1906		goto fail;
1907	}
1908	if (usr->usr_maxmem > VM_DEFAULT_USER_MAXMEM) {
1909		limit = "memory ";
1910		goto fail;
1911	}
1912	if (usr->usr_maxifs > VM_DEFAULT_USER_MAXIFS) {
1913		limit = "interface ";
1914		goto fail;
1915	}
1916
1917	return (0);
1918
1919 fail:
1920	log_warnx("%s: user %d %slimit reached", vcp->vcp_name,
1921	    usr->usr_id.uid, limit);
1922	return (-1);
1923}
1924
1925char *
1926get_string(uint8_t *ptr, size_t len)
1927{
1928	size_t	 i;
1929
1930	for (i = 0; i < len; i++)
1931		if (!isprint(ptr[i]))
1932			break;
1933
1934	return strndup(ptr, i);
1935}
1936
/*
 * Convert an IPv4 prefix length into a network-byte-order netmask.
 * A prefix length above 32 is clamped to 32; 0 yields an empty mask.
 */
uint32_t
prefixlen2mask(uint8_t prefixlen)
{
	uint32_t	 mask;

	if (prefixlen == 0)
		return (0);
	if (prefixlen > 32)
		prefixlen = 32;

	mask = 0xffffffff << (32 - prefixlen);
	return (htonl(mask));
}
1948
/*
 * Convert an IPv6 prefix length into a netmask, written to *mask.
 * A prefix length above 128 is clamped to 128.
 */
void
prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
{
	struct in6_addr	 tmp;
	int		 whole, rest;

	if (prefixlen > 128)
		prefixlen = 128;

	memset(&tmp, 0, sizeof(tmp));

	/* Set all fully covered bytes, then the partial trailing byte. */
	whole = prefixlen / 8;
	for (rest = 0; rest < whole; rest++)
		tmp.s6_addr[rest] = 0xff;
	rest = prefixlen % 8;
	if (rest)
		tmp.s6_addr[whole] = 0xff00 >> rest;

	memcpy(mask, &tmp, sizeof(tmp));
}
1967
1968void
1969getmonotime(struct timeval *tv)
1970{
1971	struct timespec	 ts;
1972
1973	if (clock_gettime(CLOCK_MONOTONIC, &ts))
1974		fatal("clock_gettime");
1975
1976	TIMESPEC_TO_TIMEVAL(tv, &ts);
1977}
1978