vmd.c revision 1.98
1/*	$OpenBSD: vmd.c,v 1.98 2018/07/15 14:36:54 reyk Exp $	*/
2
3/*
4 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/param.h>	/* nitems */
20#include <sys/queue.h>
21#include <sys/wait.h>
22#include <sys/cdefs.h>
23#include <sys/stat.h>
24#include <sys/tty.h>
25#include <sys/ttycom.h>
26#include <sys/ioctl.h>
27
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31#include <termios.h>
32#include <errno.h>
33#include <event.h>
34#include <fcntl.h>
35#include <pwd.h>
36#include <signal.h>
37#include <syslog.h>
38#include <unistd.h>
39#include <util.h>
40#include <ctype.h>
41#include <pwd.h>
42#include <grp.h>
43
44#include <machine/specialreg.h>
45#include <machine/vmmvar.h>
46
47#include "proc.h"
48#include "atomicio.h"
49#include "vmd.h"
50
__dead void usage(void);

/* Forward declarations for the parent-process handlers below. */
int	 main(int, char **);
int	 vmd_configure(void);
void	 vmd_sighdlr(int sig, short event, void *arg);
void	 vmd_shutdown(void);
int	 vmd_control_run(void);
int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
int	 vmd_check_vmh(struct vm_dump_header *);

/* VM instance/permission helpers defined later in this file. */
int	 vm_instance(struct privsep *, struct vmd_vm **,
	    struct vmop_create_params *, uid_t);
int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);

/* Global daemon state, allocated in main(). */
struct vmd	*env;

/* Child process table: id, imsg handler, run function, shutdown hook. */
static struct privsep_proc procs[] = {
	/* Keep "priv" on top as procs[0] */
	{ "priv",	PROC_PRIV,	NULL, priv },
	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm, vmm_shutdown },
};

/* For the privileged process */
static struct privsep_proc *proc_priv = &procs[0];
static struct passwd proc_privpw;	/* all-zero passwd for the priv proc */
static const uint8_t zero_mac[ETHER_ADDR_LEN];
79
80int
81vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
82{
83	struct privsep			*ps = p->p_ps;
84	int				 res = 0, ret = 0, cmd = 0, verbose;
85	unsigned int			 v = 0, flags;
86	struct vmop_create_params	 vmc;
87	struct vmop_id			 vid;
88	struct vmop_result		 vmr;
89	struct vm_dump_header		 vmh;
90	struct vmd_vm			*vm = NULL;
91	char				*str = NULL;
92	uint32_t			 id = 0;
93	struct control_sock		*rcs;
94
95	switch (imsg->hdr.type) {
96	case IMSG_VMDOP_START_VM_REQUEST:
97		IMSG_SIZE_CHECK(imsg, &vmc);
98		memcpy(&vmc, imsg->data, sizeof(vmc));
99		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
100		if (vmc.vmc_flags == 0) {
101			/* start an existing VM with pre-configured options */
102			if (!(ret == -1 && errno == EALREADY &&
103			    vm->vm_running == 0)) {
104				res = errno;
105				cmd = IMSG_VMDOP_START_VM_RESPONSE;
106			}
107		} else if (ret != 0) {
108			res = errno;
109			cmd = IMSG_VMDOP_START_VM_RESPONSE;
110		}
111		if (res == 0 &&
112		    config_setvm(ps, vm,
113		    imsg->hdr.peerid, vm->vm_params.vmc_owner.uid) == -1) {
114			res = errno;
115			cmd = IMSG_VMDOP_START_VM_RESPONSE;
116		}
117		break;
118	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
119		IMSG_SIZE_CHECK(imsg, &vid);
120		memcpy(&vid, imsg->data, sizeof(vid));
121		flags = vid.vid_flags;
122
123		if ((id = vid.vid_id) == 0) {
124			/* Lookup vm (id) by name */
125			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
126				res = ENOENT;
127				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
128				break;
129			} else if (vm->vm_shutdown &&
130			    (flags & VMOP_FORCE) == 0) {
131				res = EALREADY;
132				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
133				break;
134			} else if (vm->vm_running == 0) {
135				res = EINVAL;
136				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
137				break;
138			}
139			id = vm->vm_vmid;
140		} else if ((vm = vm_getbyvmid(id)) == NULL) {
141			res = ENOENT;
142			cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
143			break;
144		}
145		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
146		    vid.vid_uid) != 0) {
147			res = EPERM;
148			cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
149			break;
150		}
151
152		memset(&vid, 0, sizeof(vid));
153		vid.vid_id = id;
154		vid.vid_flags = flags;
155		if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
156		    imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
157			return (-1);
158		break;
159	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
160		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
161		break;
162	case IMSG_VMDOP_LOAD:
163		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
164		str = get_string((uint8_t *)imsg->data,
165		    IMSG_DATA_SIZE(imsg));
166	case IMSG_VMDOP_RELOAD:
167		if (vmd_reload(0, str) == -1)
168			cmd = IMSG_CTL_FAIL;
169		else
170			cmd = IMSG_CTL_OK;
171		free(str);
172		break;
173	case IMSG_CTL_RESET:
174		IMSG_SIZE_CHECK(imsg, &v);
175		memcpy(&v, imsg->data, sizeof(v));
176		if (vmd_reload(v, NULL) == -1)
177			cmd = IMSG_CTL_FAIL;
178		else
179			cmd = IMSG_CTL_OK;
180		break;
181	case IMSG_CTL_VERBOSE:
182		IMSG_SIZE_CHECK(imsg, &verbose);
183		memcpy(&verbose, imsg->data, sizeof(verbose));
184		log_setverbose(verbose);
185
186		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
187		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
188		cmd = IMSG_CTL_OK;
189		break;
190	case IMSG_VMDOP_PAUSE_VM:
191	case IMSG_VMDOP_UNPAUSE_VM:
192		IMSG_SIZE_CHECK(imsg, &vid);
193		memcpy(&vid, imsg->data, sizeof(vid));
194		if (vid.vid_id == 0) {
195			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
196				res = ENOENT;
197				cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
198				break;
199			} else {
200				vid.vid_id = vm->vm_vmid;
201			}
202		} else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
203			res = ENOENT;
204			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
205			break;
206		}
207		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
208		    vid.vid_uid) != 0) {
209			res = EPERM;
210			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
211			break;
212		}
213		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
214		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
215		break;
216	case IMSG_VMDOP_SEND_VM_REQUEST:
217		IMSG_SIZE_CHECK(imsg, &vid);
218		memcpy(&vid, imsg->data, sizeof(vid));
219		id = vid.vid_id;
220		if (vid.vid_id == 0) {
221			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
222				res = ENOENT;
223				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
224				close(imsg->fd);
225				break;
226			} else {
227				vid.vid_id = vm->vm_vmid;
228			}
229		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
230			res = ENOENT;
231			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
232			close(imsg->fd);
233			break;
234		} else {
235		}
236		vmr.vmr_id = vid.vid_id;
237		log_debug("%s: sending fd to vmm", __func__);
238		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
239		    imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid));
240		break;
241	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
242		IMSG_SIZE_CHECK(imsg, &vid);
243		memcpy(&vid, imsg->data, sizeof(vid));
244		if (imsg->fd == -1) {
245			log_warnx("%s: invalid fd", __func__);
246			return (-1);
247		}
248		if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) !=
249		    sizeof(vmh)) {
250			log_warnx("%s: error reading vmh from received vm",
251			    __func__);
252			res = EIO;
253			close(imsg->fd);
254			cmd = IMSG_VMDOP_START_VM_RESPONSE;
255			break;
256		}
257
258		if (vmd_check_vmh(&vmh)) {
259			res = ENOENT;
260			close(imsg->fd);
261			cmd = IMSG_VMDOP_START_VM_RESPONSE;
262			break;
263		}
264		if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) !=
265		    sizeof(vmc)) {
266			log_warnx("%s: error reading vmc from received vm",
267			    __func__);
268			res = EIO;
269			close(imsg->fd);
270			cmd = IMSG_VMDOP_START_VM_RESPONSE;
271			break;
272		}
273		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
274		    sizeof(vmc.vmc_params.vcp_name));
275		vmc.vmc_params.vcp_id = 0;
276
277		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
278		if (ret != 0) {
279			res = errno;
280			cmd = IMSG_VMDOP_START_VM_RESPONSE;
281			close(imsg->fd);
282		} else {
283			vm->vm_received = 1;
284			config_setvm(ps, vm, imsg->hdr.peerid,
285			    vmc.vmc_owner.uid);
286			log_debug("%s: sending fd to vmm", __func__);
287			proc_compose_imsg(ps, PROC_VMM, -1,
288			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd,
289			    NULL, 0);
290		}
291		break;
292	case IMSG_VMDOP_DONE:
293		control_reset(&ps->ps_csock);
294		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
295			control_reset(rcs);
296		cmd = 0;
297		break;
298	default:
299		return (-1);
300	}
301
302	switch (cmd) {
303	case 0:
304		break;
305	case IMSG_VMDOP_START_VM_RESPONSE:
306	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
307		memset(&vmr, 0, sizeof(vmr));
308		vmr.vmr_result = res;
309		vmr.vmr_id = id;
310		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
311		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
312			return (-1);
313		break;
314	default:
315		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
316		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
317			return (-1);
318		break;
319	}
320
321	return (0);
322}
323
/*
 * Handle imsgs arriving from the vmm process: mostly asynchronous
 * responses to requests previously forwarded by vmd_dispatch_control(),
 * plus unsolicited events (VM termination).  Returns 0 on success,
 * -1 to tear down the imsg channel.
 */
int
vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct vmop_result	 vmr;
	struct privsep		*ps = p->p_ps;
	int			 res = 0;
	struct vmd_vm		*vm;
	struct vm_create_params	*vcp;
	struct vmop_info_result	 vir;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
			break;
		/*
		 * NOTE(review): sizeof(imsg->data) is the size of the
		 * pointer, not of the payload — IMSG_DATA_SIZE(imsg) or
		 * &vmr/sizeof(vmr) looks intended here; confirm against
		 * the control-side size check before changing.
		 */
		proc_compose_imsg(ps, PROC_CONTROL, -1,
		    imsg->hdr.type, imsg->hdr.peerid, -1,
		    imsg->data, sizeof(imsg->data));
		log_info("%s: paused vm %d successfully",
		    vm->vm_params.vmc_params.vcp_name,
		    vm->vm_vmid);
		break;
	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
			break;
		/* NOTE(review): same sizeof(imsg->data) concern as above. */
		proc_compose_imsg(ps, PROC_CONTROL, -1,
		    imsg->hdr.type, imsg->hdr.peerid, -1,
		    imsg->data, sizeof(imsg->data));
		log_info("%s: unpaused vm %d successfully.",
		    vm->vm_params.vmc_params.vcp_name,
		    vm->vm_vmid);
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		/* peerid carries the vmd vmid the start was issued for */
		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
			break;
		vm->vm_pid = vmr.vmr_pid;
		vcp = &vm->vm_params.vmc_params;
		vcp->vcp_id = vmr.vmr_id;	/* kernel-assigned VM id */

		/*
		 * If the peerid is not -1, forward the response back to
		 * the control socket.  If it is -1, the request originated
		 * from the parent, not the control socket.
		 */
		if (vm->vm_peerid != (uint32_t)-1) {
			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
			    sizeof(vmr.vmr_ttyname));
			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
			    imsg->hdr.type, vm->vm_peerid, -1,
			    &vmr, sizeof(vmr)) == -1) {
				errno = vmr.vmr_result;
				log_warn("%s: failed to foward vm result",
				    vcp->vcp_name);
				vm_remove(vm, __func__);
				return (-1);
			}
		}

		if (vmr.vmr_result) {
			/* vmm reported a start failure; drop the VM. */
			errno = vmr.vmr_result;
			log_warn("%s: failed to start vm", vcp->vcp_name);
			vm_remove(vm, __func__);
			break;
		}

		/* Now configure all the interfaces */
		if (vm_priv_ifconfig(ps, vm) == -1) {
			log_warn("%s: failed to configure vm", vcp->vcp_name);
			vm_remove(vm, __func__);
			break;
		}

		log_info("%s: started vm %d successfully, tty %s",
		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
		break;
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
		    __func__, vmr.vmr_id);
		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
			break;
		if (vmr.vmr_result == 0) {
			/* Mark VM as shutting down */
			vm->vm_shutdown = 1;
		}
		break;
	case IMSG_VMDOP_SEND_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
			break;
		if (!vmr.vmr_result)
			log_info("%s: sent vm %d successfully.",
			    vm->vm_params.vmc_params.vcp_name,
			    vm->vm_vmid);
		/*
		 * FALLTHROUGH: a sent VM is torn down locally via the
		 * TERMINATE event handling below.
		 * NOTE(review): if this fallthrough is not intended, a
		 * break is missing here — confirm against the migration
		 * protocol.
		 */
	case IMSG_VMDOP_TERMINATE_VM_EVENT:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
		    __func__, vmr.vmr_id, vmr.vmr_result);
		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
			log_debug("%s: vm %d is no longer available",
			    __func__, vmr.vmr_id);
			break;
		}
		if (vmr.vmr_result != EAGAIN) {
			/* config-defined VMs are kept; ad-hoc VMs removed */
			if (vm->vm_from_config)
				vm_stop(vm, 0, __func__);
			else
				vm_remove(vm, __func__);
		} else {
			/* Stop VM instance but keep the tty open */
			vm_stop(vm, 1, __func__);
			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
		}

		/* Send a response if a control client is waiting for it */
		if (imsg->hdr.peerid != (uint32_t)-1) {
			/* the error is meaningless for deferred responses */
			vmr.vmr_result = 0;

			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
			    IMSG_VMDOP_TERMINATE_VM_RESPONSE,
			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
				return (-1);
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_DATA:
		IMSG_SIZE_CHECK(imsg, &vir);
		memcpy(&vir, imsg->data, sizeof(vir));
		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
			/* Augment the kernel info with vmd-side details. */
			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
			if (vm->vm_ttyname != NULL)
				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
				    sizeof(vir.vir_ttyname));
			if (vm->vm_shutdown) {
				/* XXX there might be a nicer way */
				(void)strlcat(vir.vir_info.vir_name,
				    " - stopping",
				    sizeof(vir.vir_info.vir_name));
			}
			/* get the user id who started the vm */
			vir.vir_uid = vm->vm_uid;
			vir.vir_gid = vm->vm_params.vmc_owner.gid;
		}
		/*
		 * NOTE(review): on the failure path below, vm may be NULL
		 * (unknown vir_id) yet vm->vm_vmid is dereferenced in the
		 * log call — verify; vm_remove(NULL) itself is a no-op.
		 */
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type,
		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
			log_debug("%s: GET_INFO_VM failed for vm %d, removing",
			    __func__, vm->vm_vmid);
			vm_remove(vm, __func__);
			return (-1);
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
		/*
		 * PROC_VMM has responded with the *running* VMs, now we
		 * append the others. These use the special value 0 for their
		 * kernel id to indicate that they are not running.
		 */
		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
			if (!vm->vm_running) {
				memset(&vir, 0, sizeof(vir));
				vir.vir_info.vir_id = vm->vm_vmid;
				strlcpy(vir.vir_info.vir_name,
				    vm->vm_params.vmc_params.vcp_name,
				    VMM_MAX_NAME_LEN);
				vir.vir_info.vir_memory_size =
				    vm->vm_params.vmc_params.
				    vcp_memranges[0].vmr_size;
				vir.vir_info.vir_ncpus =
				    vm->vm_params.vmc_params.vcp_ncpus;
				/* get the configured user id for this vm */
				vir.vir_uid = vm->vm_params.vmc_owner.uid;
				vir.vir_gid = vm->vm_params.vmc_owner.gid;
				if (proc_compose_imsg(ps, PROC_CONTROL, -1,
				    IMSG_VMDOP_GET_INFO_VM_DATA,
				    imsg->hdr.peerid, -1, &vir,
				    sizeof(vir)) == -1) {
					log_debug("%s: GET_INFO_VM_END failed",
					    __func__);
					vm_remove(vm, __func__);
					return (-1);
				}
			}
		}
		IMSG_SIZE_CHECK(imsg, &res);
		/* Terminate the listing on the control side. */
		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
		break;
	default:
		return (-1);
	}

	return (0);
}
525
526int
527vmd_check_vmh(struct vm_dump_header *vmh)
528{
529	int i;
530	unsigned int code, leaf;
531	unsigned int a, b, c, d;
532
533
534	if (vmh->vmh_version != VM_DUMP_VERSION) {
535		log_warnx("%s: incompatible dump version", __func__);
536		return (-1);
537	}
538
539	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
540		code = vmh->vmh_cpuids[i].code;
541		leaf = vmh->vmh_cpuids[i].leaf;
542		if (leaf != 0x00) {
543			log_debug("%s: invalid leaf 0x%x for code 0x%x",
544			    __func__, leaf, code);
545			return (-1);
546		}
547
548		switch (code) {
549		case 0x00:
550			CPUID_LEAF(code, leaf, a, b, c, d);
551			if (vmh->vmh_cpuids[i].a > a) {
552				log_debug("%s: incompatible cpuid level",
553				    __func__);
554				return (-1);
555			}
556			if (!(vmh->vmh_cpuids[i].b == b &&
557			    vmh->vmh_cpuids[i].c == c &&
558			    vmh->vmh_cpuids[i].d == d)) {
559				log_debug("%s: incompatible cpu brand",
560				    __func__);
561				return (-1);
562			}
563			break;
564
565		case 0x01:
566			CPUID_LEAF(code, leaf, a, b, c, d);
567			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
568			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
569				log_debug("%s: incompatible cpu features "
570				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
571				    code, leaf);
572				return (-1);
573			}
574			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
575			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
576				log_debug("%s: incompatible cpu features "
577				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
578				    code, leaf);
579				return (-1);
580			}
581			break;
582
583		case 0x07:
584			CPUID_LEAF(code, leaf, a, b, c, d);
585			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
586			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
587				log_debug("%s: incompatible cpu features "
588				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
589				    code, leaf);
590				return (-1);
591			}
592			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
593			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
594				log_debug("%s: incompatible cpu features "
595				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
596				    code, leaf);
597				return (-1);
598			}
599			break;
600
601		case 0x0d:
602			CPUID_LEAF(code, leaf, a, b, c, d);
603			if (vmh->vmh_cpuids[i].b > b) {
604				log_debug("%s: incompatible cpu: insufficient "
605				    "max save area for enabled XCR0 features",
606				    __func__);
607				return (-1);
608			}
609			if (vmh->vmh_cpuids[i].c > c) {
610				log_debug("%s: incompatible cpu: insufficient "
611				    "max save area for supported XCR0 features",
612				    __func__);
613				return (-1);
614			}
615			break;
616
617		case 0x80000001:
618			CPUID_LEAF(code, leaf, a, b, c, d);
619			if ((vmh->vmh_cpuids[i].a & a) !=
620			    vmh->vmh_cpuids[i].a) {
621				log_debug("%s: incompatible cpu features "
622				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
623				    code, leaf);
624				return (-1);
625			}
626			if ((vmh->vmh_cpuids[i].c & c) !=
627			    vmh->vmh_cpuids[i].c) {
628				log_debug("%s: incompatible cpu features "
629				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
630				    code, leaf);
631				return (-1);
632			}
633			if ((vmh->vmh_cpuids[i].d & d) !=
634			    vmh->vmh_cpuids[i].d) {
635				log_debug("%s: incompatible cpu features "
636				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
637				    code, leaf);
638				return (-1);
639			}
640			break;
641
642		default:
643			log_debug("%s: unknown code 0x%x", __func__, code);
644			return (-1);
645		}
646	}
647
648	return (0);
649}
650
651void
652vmd_sighdlr(int sig, short event, void *arg)
653{
654	if (privsep_process != PROC_PARENT)
655		return;
656	log_debug("%s: handling signal", __func__);
657
658	switch (sig) {
659	case SIGHUP:
660		log_info("%s: reload requested with SIGHUP", __func__);
661
662		/*
663		 * This is safe because libevent uses async signal handlers
664		 * that run in the event loop and not in signal context.
665		 */
666		(void)vmd_reload(0, NULL);
667		break;
668	case SIGPIPE:
669		log_info("%s: ignoring SIGPIPE", __func__);
670		break;
671	case SIGUSR1:
672		log_info("%s: ignoring SIGUSR1", __func__);
673		break;
674	case SIGTERM:
675	case SIGINT:
676		vmd_shutdown();
677		break;
678	default:
679		fatalx("unexpected signal");
680	}
681}
682
683__dead void
684usage(void)
685{
686	extern char *__progname;
687	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
688	    __progname);
689	exit(1);
690}
691
/*
 * vmd entry point: parse options, initialize global state, fork the
 * privsep children (priv, control, vmm), then run the parent event
 * loop.  Only the parent process returns from proc_init().
 */
int
main(int argc, char **argv)
{
	struct privsep		*ps;
	int			 ch;
	const char		*conffile = VMD_CONF;
	enum privsep_procid	 proc_id = PROC_PARENT;
	int			 proc_instance = 0;
	const char		*errp, *title = NULL;
	int			 argc0 = argc;	/* keep full argv for re-exec */

	log_init(0, LOG_DAEMON);

	if ((env = calloc(1, sizeof(*env))) == NULL)
		fatal("calloc: env");

	while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) {
		switch (ch) {
		case 'D':
			/* define a config-file macro on the command line */
			if (cmdline_symset(optarg) < 0)
				log_warnx("could not parse macro definition %s",
				    optarg);
			break;
		case 'd':
			env->vmd_debug = 2;
			break;
		case 'f':
			conffile = optarg;
			break;
		case 'v':
			env->vmd_verbose++;
			break;
		case 'n':
			/* config test mode: parse and exit */
			env->vmd_noaction = 1;
			break;
		case 'P':
			/* internal: run as the named child process */
			title = optarg;
			proc_id = proc_getid(procs, nitems(procs), title);
			if (proc_id == PROC_MAX)
				fatalx("invalid process name");
			break;
		case 'I':
			/* internal: child process instance number */
			proc_instance = strtonum(optarg, 0,
			    PROC_MAX_INSTANCES, &errp);
			if (errp)
				fatalx("invalid process instance");
			break;
		default:
			usage();
		}
	}

	argc -= optind;
	if (argc > 0)
		usage();

	if (env->vmd_noaction && !env->vmd_debug)
		env->vmd_debug = 1;

	/* check for root privileges */
	if (env->vmd_noaction == 0) {
		if (geteuid())
			fatalx("need root privileges");
	}

	ps = &env->vmd_ps;
	ps->ps_env = env;
	env->vmd_fd = -1;

	if (config_init(env) == -1)
		fatal("failed to initialize configuration");

	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
		fatal("unknown user %s", VMD_USER);

	/* First proc runs as root without pledge but in default chroot */
	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */

	/* Open /dev/vmm */
	if (env->vmd_noaction == 0) {
		env->vmd_fd = open(VMM_NODE, O_RDWR);
		if (env->vmd_fd == -1)
			fatal("%s", VMM_NODE);
	}

	/* Configure the control socket */
	ps->ps_csock.cs_name = SOCKET_NAME;
	TAILQ_INIT(&ps->ps_rcsocks);

	/* Configuration will be parsed after forking the children */
	env->vmd_conffile = conffile;

	log_init(env->vmd_debug, LOG_DAEMON);
	log_setverbose(env->vmd_verbose);

	if (env->vmd_noaction)
		ps->ps_noaction = 1;
	ps->ps_instance = proc_instance;
	if (title != NULL)
		ps->ps_title[proc_id] = title;

	/* only the parent returns */
	proc_init(ps, procs, nitems(procs), argc0, argv, proc_id);

	log_procinit("parent");
	if (!env->vmd_debug && daemon(0, 0) == -1)
		fatal("can't daemonize");

	if (ps->ps_noaction == 0)
		log_info("startup");

	/* Event loop and signal handlers must be set up before config. */
	event_init();

	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);

	signal_add(&ps->ps_evsigint, NULL);
	signal_add(&ps->ps_evsigterm, NULL);
	signal_add(&ps->ps_evsighup, NULL);
	signal_add(&ps->ps_evsigpipe, NULL);
	signal_add(&ps->ps_evsigusr1, NULL);

	if (!env->vmd_noaction)
		proc_connect(ps);

	if (vmd_configure() == -1)
		fatalx("configuration failed");

	event_dispatch();

	log_debug("parent exiting");

	return (0);
}
830
/*
 * Parent-side initial configuration: pledge the parent, parse the
 * config file, push the global config to the children, then bring up
 * the configured switches and VMs.  Returns 0 on success, -1 on error
 * (and exits directly on parse failure or in noaction mode).
 */
int
vmd_configure(void)
{
	struct vmd_vm		*vm;
	struct vmd_switch	*vsw;

	/* ptm is needed to allocate VM consoles via openpty(). */
	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
		fatal("open %s", PATH_PTMDEV);

	/*
	 * pledge in the parent process:
	 * stdio - for malloc and basic I/O including events.
	 * rpath - for reload to open and read the configuration files.
	 * wpath - for opening disk images and tap devices.
	 * tty - for openpty and TIOCUCNTL.
	 * proc - run kill to terminate its children safely.
	 * sendfd - for disks, interfaces and other fds.
	 * recvfd - for send and receive.
	 * getpw - lookup user or group id by name.
	 * chown, fattr - change tty ownership
	 * flock - locking disk files
	 */
	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
	    " chown fattr flock", NULL) == -1)
		fatal("pledge");

	if (parse_config(env->vmd_conffile) == -1) {
		proc_kill(&env->vmd_ps);
		exit(1);
	}

	if (env->vmd_noaction) {
		/* -n: config checked out fine; tear down and exit. */
		fprintf(stderr, "configuration OK\n");
		proc_kill(&env->vmd_ps);
		exit(0);
	}

	/* Send shared global configuration to all children */
	if (config_setconfig(env) == -1)
		return (-1);

	/* Create all configured switches that are not yet running. */
	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
		if (vsw->sw_running)
			continue;
		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
			log_warn("%s: failed to create switch %s",
			    __func__, vsw->sw_name);
			switch_remove(vsw);
			return (-1);
		}
	}

	/* Start all configured VMs unless they are marked disabled. */
	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
		if (vm->vm_disabled) {
			log_debug("%s: not creating vm %s (disabled)",
			    __func__,
			    vm->vm_params.vmc_params.vcp_name);
			continue;
		}
		if (config_setvm(&env->vmd_ps, vm,
		    -1, vm->vm_params.vmc_owner.uid) == -1)
			return (-1);
	}

	return (0);
}
897
/*
 * Reload or reset the running configuration.
 *
 * reset != 0: purge the current configuration at the given reset level
 * and tell the children to do the same; no file is parsed.
 * reset == 0: parse "filename" (or the original config file if NULL/
 * empty, which makes this a full reload that first drops all
 * non-running VMs) and start any newly configured switches/VMs.
 * Returns 0 on success, -1 on error.
 */
int
vmd_reload(unsigned int reset, const char *filename)
{
	struct vmd_vm		*vm, *next_vm;
	struct vmd_switch	*vsw;
	int			 reload = 0;

	/* Switch back to the default config file */
	if (filename == NULL || *filename == '\0') {
		filename = env->vmd_conffile;
		reload = 1;
	}

	log_debug("%s: level %d config file %s", __func__, reset, filename);

	if (reset) {
		/* Purge the configuration */
		config_purge(env, reset);
		config_setreset(env, reset);
	} else {
		/*
		 * Load or reload the configuration.
		 *
		 * Reloading removes all non-running VMs before processing the
		 * config file, whereas loading only adds to the existing list
		 * of VMs.
		 */

		if (reload) {
			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
			    next_vm) {
				if (vm->vm_running == 0) {
					DPRINTF("%s: calling vm_remove",
					    __func__);
					vm_remove(vm, __func__);
				}
			}
		}

		if (parse_config(filename) == -1) {
			log_debug("%s: failed to load config file %s",
			    __func__, filename);
			return (-1);
		}

		if (reload) {
			/* Update shared global configuration in all children */
			if (config_setconfig(env) == -1)
				return (-1);
		}

		/* Bring up any newly configured switches. */
		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
			if (vsw->sw_running)
				continue;
			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
				log_warn("%s: failed to create switch %s",
				    __func__, vsw->sw_name);
				switch_remove(vsw);
				return (-1);
			}
		}

		/* Start newly configured VMs; running ones are untouched. */
		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
			if (vm->vm_running == 0) {
				if (vm->vm_disabled) {
					log_debug("%s: not creating vm %s"
					    " (disabled)", __func__,
					    vm->vm_params.vmc_params.vcp_name);
					continue;
				}
				if (config_setvm(&env->vmd_ps, vm,
				    -1, vm->vm_params.vmc_owner.uid) == -1)
					return (-1);
			} else {
				log_debug("%s: not creating vm \"%s\": "
				    "(running)", __func__,
				    vm->vm_params.vmc_params.vcp_name);
			}
		}
	}

	return (0);
}
981
982void
983vmd_shutdown(void)
984{
985	struct vmd_vm *vm, *vm_next;
986
987	log_debug("%s: performing shutdown", __func__);
988
989	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
990		vm_remove(vm, __func__);
991	}
992
993	proc_kill(&env->vmd_ps);
994	free(env);
995
996	log_warnx("parent terminating");
997	exit(0);
998}
999
1000struct vmd_vm *
1001vm_getbyvmid(uint32_t vmid)
1002{
1003	struct vmd_vm	*vm;
1004
1005	if (vmid == 0)
1006		return (NULL);
1007	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1008		if (vm->vm_vmid == vmid)
1009			return (vm);
1010	}
1011
1012	return (NULL);
1013}
1014
1015struct vmd_vm *
1016vm_getbyid(uint32_t id)
1017{
1018	struct vmd_vm	*vm;
1019
1020	if (id == 0)
1021		return (NULL);
1022	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1023		if (vm->vm_params.vmc_params.vcp_id == id)
1024			return (vm);
1025	}
1026
1027	return (NULL);
1028}
1029
1030uint32_t
1031vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1032{
1033	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1034		return (0);
1035	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1036	    id, vm->vm_vmid);
1037	return (vm->vm_vmid);
1038}
1039
1040uint32_t
1041vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1042{
1043	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1044		return (0);
1045	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1046	    vmid, vm->vm_params.vmc_params.vcp_id);
1047	return (vm->vm_params.vmc_params.vcp_id);
1048}
1049
1050struct vmd_vm *
1051vm_getbyname(const char *name)
1052{
1053	struct vmd_vm	*vm;
1054
1055	if (name == NULL)
1056		return (NULL);
1057	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1058		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1059			return (vm);
1060	}
1061
1062	return (NULL);
1063}
1064
1065struct vmd_vm *
1066vm_getbypid(pid_t pid)
1067{
1068	struct vmd_vm	*vm;
1069
1070	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1071		if (vm->vm_pid == pid)
1072			return (vm);
1073	}
1074
1075	return (NULL);
1076}
1077
/*
 * Stop a VM instance: clear the running state, drop the per-user
 * accounting reference, and close every fd the VM holds (imsg
 * channel, disks, network interfaces, kernel and cdrom images).
 * With keeptty == 0 the console tty is closed as well and the owning
 * uid is cleared; keeptty != 0 leaves the tty open so a restarted
 * instance keeps its console.  NULL vm is a no-op.
 */
void
vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 i;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s stopping vm %d%s",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");

	vm->vm_running = 0;
	vm->vm_shutdown = 0;

	/* Drop the running-VM count for the owner and release the ref. */
	user_inc(&vm->vm_params.vmc_params, vm->vm_user, 0);
	user_put(vm->vm_user);

	/* Close the imsg channel to the vm process, if any. */
	if (vm->vm_iev.ibuf.fd != -1) {
		event_del(&vm->vm_iev.ev);
		close(vm->vm_iev.ibuf.fd);
	}
	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
		if (vm->vm_disks[i] != -1) {
			close(vm->vm_disks[i]);
			vm->vm_disks[i] = -1;
		}
	}
	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
		if (vm->vm_ifs[i].vif_fd != -1) {
			close(vm->vm_ifs[i].vif_fd);
			vm->vm_ifs[i].vif_fd = -1;
		}
		/* free(NULL) is a no-op, so no guards needed here */
		free(vm->vm_ifs[i].vif_name);
		free(vm->vm_ifs[i].vif_switch);
		free(vm->vm_ifs[i].vif_group);
		vm->vm_ifs[i].vif_name = NULL;
		vm->vm_ifs[i].vif_switch = NULL;
		vm->vm_ifs[i].vif_group = NULL;
	}
	if (vm->vm_kernel != -1) {
		close(vm->vm_kernel);
		vm->vm_kernel = -1;
	}
	if (vm->vm_cdrom != -1) {
		close(vm->vm_cdrom);
		vm->vm_cdrom = -1;
	}
	if (!keeptty) {
		vm_closetty(vm);
		vm->vm_uid = 0;
	}
}
1132
/*
 * Remove a VM from the running configuration: unlink it from the
 * global VM list, stop it (closing all fds and the tty) and free it.
 * NULL vm is a no-op.
 */
void
vm_remove(struct vmd_vm *vm, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s removing vm %d from running config",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid);

	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);

	/*
	 * NOTE(review): vm_stop() below also calls user_put() on the
	 * same vm_user — presumably user_put() is refcount-based and
	 * this drops a separate registration reference; confirm in
	 * the user accounting code.
	 */
	user_put(vm->vm_user);
	vm_stop(vm, 0, caller);
	free(vm);
}
1151
1152int
1153vm_register(struct privsep *ps, struct vmop_create_params *vmc,
1154    struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
1155{
1156	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
1157	struct vm_create_params	*vcp = &vmc->vmc_params;
1158	struct vmop_owner	*vmo = NULL;
1159	struct vmd_user		*usr = NULL;
1160	uint32_t		 rng;
1161	unsigned int		 i;
1162	struct vmd_switch	*sw;
1163	char			*s;
1164
1165	/* Check if this is an instance of another VM */
1166	if (vm_instance(ps, &vm_parent, vmc, uid) == -1)
1167		return (-1);
1168
1169	errno = 0;
1170	*ret_vm = NULL;
1171
1172	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1173	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1174		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
1175		    uid) != 0) {
1176			errno = EPERM;
1177			goto fail;
1178		}
1179		*ret_vm = vm;
1180		errno = EALREADY;
1181		goto fail;
1182	}
1183
1184	if (vm_parent != NULL)
1185		vmo = &vm_parent->vm_params.vmc_insowner;
1186
1187	/* non-root users can only start existing VMs or instances */
1188	if (vm_checkperm(NULL, vmo, uid) != 0) {
1189		log_warnx("permission denied");
1190		errno = EPERM;
1191		goto fail;
1192	}
1193	if (vmc->vmc_flags == 0) {
1194		log_warnx("invalid configuration, no devices");
1195		errno = VMD_DISK_MISSING;
1196		goto fail;
1197	}
1198	if (vcp->vcp_ncpus == 0)
1199		vcp->vcp_ncpus = 1;
1200	if (vcp->vcp_memranges[0].vmr_size == 0)
1201		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
1202	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
1203		log_warnx("invalid number of CPUs");
1204		goto fail;
1205	} else if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) {
1206		log_warnx("invalid number of disks");
1207		goto fail;
1208	} else if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) {
1209		log_warnx("invalid number of interfaces");
1210		goto fail;
1211	} else if (strlen(vcp->vcp_kernel) == 0 &&
1212	    vcp->vcp_ndisks == 0 && strlen(vcp->vcp_cdrom) == 0) {
1213		log_warnx("no kernel or disk/cdrom specified");
1214		goto fail;
1215	} else if (strlen(vcp->vcp_name) == 0) {
1216		log_warnx("invalid VM name");
1217		goto fail;
1218	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
1219	    *vcp->vcp_name == '_') {
1220		log_warnx("invalid VM name");
1221		goto fail;
1222	} else {
1223		for (s = vcp->vcp_name; *s != '\0'; ++s) {
1224			if (!(isalnum(*s) || *s == '.' || *s == '-' ||
1225			    *s == '_')) {
1226				log_warnx("invalid VM name");
1227				goto fail;
1228			}
1229		}
1230	}
1231
1232	/* track active users */
1233	if (uid != 0 && env->vmd_users != NULL &&
1234	    (usr = user_get(uid)) == NULL) {
1235		log_warnx("could not add user");
1236		goto fail;
1237	}
1238
1239	if ((vm = calloc(1, sizeof(*vm))) == NULL)
1240		goto fail;
1241
1242	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
1243	vmc = &vm->vm_params;
1244	vcp = &vmc->vmc_params;
1245	vm->vm_pid = -1;
1246	vm->vm_tty = -1;
1247	vm->vm_receive_fd = -1;
1248	vm->vm_paused = 0;
1249	vm->vm_user = usr;
1250
1251	for (i = 0; i < vcp->vcp_ndisks; i++)
1252		vm->vm_disks[i] = -1;
1253	for (i = 0; i < vcp->vcp_nnics; i++) {
1254		vm->vm_ifs[i].vif_fd = -1;
1255
1256		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
1257			/* inherit per-interface flags from the switch */
1258			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
1259		}
1260
1261		/*
1262		 * If the MAC address is zero, always randomize it in vmd(8)
1263		 * because we cannot rely on the guest OS to do the right
1264		 * thing like OpenBSD does.  Based on ether_fakeaddr()
1265		 * from the kernel, incremented by one to differentiate
1266		 * the source.
1267		 */
1268		if (memcmp(zero_mac, &vcp->vcp_macs[i], ETHER_ADDR_LEN) == 0) {
1269			rng = arc4random();
1270			vcp->vcp_macs[i][0] = 0xfe;
1271			vcp->vcp_macs[i][1] = 0xe1;
1272			vcp->vcp_macs[i][2] = 0xba + 1;
1273			vcp->vcp_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
1274			vcp->vcp_macs[i][4] = rng;
1275			vcp->vcp_macs[i][5] = rng >> 8;
1276		}
1277	}
1278	vm->vm_kernel = -1;
1279	vm->vm_cdrom = -1;
1280	vm->vm_iev.ibuf.fd = -1;
1281
1282	if (++env->vmd_nvm == 0)
1283		fatalx("too many vms");
1284
1285	/* Assign a new internal Id if not specified */
1286	vm->vm_vmid = id == 0 ? env->vmd_nvm : id;
1287
1288	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
1289	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);
1290
1291	*ret_vm = vm;
1292	return (0);
1293 fail:
1294	if (errno == 0)
1295		errno = EINVAL;
1296	return (-1);
1297}
1298
1299int
1300vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1301    struct vmop_create_params *vmc, uid_t uid)
1302{
1303	char			*name;
1304	struct vm_create_params	*vcp = &vmc->vmc_params;
1305	struct vmop_create_params *vmcp;
1306	struct vm_create_params	*vcpp;
1307	struct vmd_vm		*vm = NULL;
1308	unsigned int		 i, j;
1309	uint32_t		 id;
1310
1311	/* return without error if the parent is NULL (nothing to inherit) */
1312	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1313	    (*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL)
1314		return (0);
1315
1316	errno = 0;
1317	vmcp = &(*vm_parent)->vm_params;
1318	vcpp = &vmcp->vmc_params;
1319
1320	/* Are we allowed to create an instance from this VM? */
1321	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1322		log_warnx("vm \"%s\" no permission to create vm instance",
1323		    vcpp->vcp_name);
1324		errno = ENAMETOOLONG;
1325		return (-1);
1326	}
1327
1328	id = vcp->vcp_id;
1329	name = vcp->vcp_name;
1330
1331	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1332	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1333		errno = EPROCLIM;
1334		return (-1);
1335	}
1336
1337	/* CPU */
1338	if (vcp->vcp_ncpus == 0)
1339		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1340	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1341	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1342		log_warnx("vm \"%s\" no permission to set cpus", name);
1343		errno = EPERM;
1344		return (-1);
1345	}
1346
1347	/* memory */
1348	if (vcp->vcp_memranges[0].vmr_size == 0)
1349		vcp->vcp_memranges[0].vmr_size =
1350		    vcpp->vcp_memranges[0].vmr_size;
1351	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1352	    vcp->vcp_memranges[0].vmr_size !=
1353	    vcpp->vcp_memranges[0].vmr_size) {
1354		log_warnx("vm \"%s\" no permission to set memory", name);
1355		errno = EPERM;
1356		return (-1);
1357	}
1358
1359	/* disks cannot be inherited */
1360	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1361	    vcp->vcp_ndisks) {
1362		log_warnx("vm \"%s\" no permission to set disks", name);
1363		errno = EPERM;
1364		return (-1);
1365	}
1366	for (i = 0; i < vcp->vcp_ndisks; i++) {
1367		/* Check if this disk is already used in the parent */
1368		for (j = 0; j < vcpp->vcp_ndisks; j++) {
1369			if (strcmp(vcp->vcp_disks[i],
1370			    vcpp->vcp_disks[j]) == 0) {
1371				log_warnx("vm \"%s\" disk %s cannot be reused",
1372				    name, vcp->vcp_disks[i]);
1373				errno = EBUSY;
1374				return (-1);
1375			}
1376		}
1377		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1378	}
1379
1380	/* interfaces */
1381	if (vcp->vcp_nnics > 0 &&
1382	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1383	    vcp->vcp_nnics != vcpp->vcp_nnics) {
1384		log_warnx("vm \"%s\" no permission to set interfaces", name);
1385		errno = EPERM;
1386		return (-1);
1387	}
1388	for (i = 0; i < vcpp->vcp_nnics; i++) {
1389		/* Interface got overwritten */
1390		if (i < vcp->vcp_nnics)
1391			continue;
1392
1393		/* Copy interface from parent */
1394		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1395		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1396		    sizeof(vmc->vmc_ifnames[i]));
1397		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1398		    sizeof(vmc->vmc_ifswitch[i]));
1399		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1400		    sizeof(vmc->vmc_ifgroup[i]));
1401		memcpy(vcp->vcp_macs[i], vcpp->vcp_macs[i],
1402		    sizeof(vcp->vcp_macs[i]));
1403		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1404		vcp->vcp_nnics++;
1405	}
1406	for (i = 0; i < vcp->vcp_nnics; i++) {
1407		for (j = 0; j < vcpp->vcp_nnics; j++) {
1408			if (memcmp(zero_mac, vcp->vcp_macs[i],
1409			    sizeof(vcp->vcp_macs[i])) != 0 &&
1410			    memcmp(vcpp->vcp_macs[i], vcp->vcp_macs[i],
1411			    sizeof(vcp->vcp_macs[i])) != 0) {
1412				log_warnx("vm \"%s\" lladdr cannot be reused",
1413				    name);
1414				errno = EBUSY;
1415				return (-1);
1416			}
1417			if (strlen(vmc->vmc_ifnames[i]) &&
1418			    strcmp(vmc->vmc_ifnames[i],
1419			    vmcp->vmc_ifnames[j]) == 0) {
1420				log_warnx("vm \"%s\" %s cannot be reused",
1421				    vmc->vmc_ifnames[i], name);
1422				errno = EBUSY;
1423				return (-1);
1424			}
1425		}
1426	}
1427
1428	/* kernel */
1429	if (strlen(vcp->vcp_kernel) > 0) {
1430		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1431			log_warnx("vm \"%s\" no permission to set boot image",
1432			    name);
1433			errno = EPERM;
1434			return (-1);
1435		}
1436		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1437	} else if (strlcpy(vcp->vcp_kernel, vcpp->vcp_kernel,
1438	    sizeof(vcp->vcp_kernel)) >= sizeof(vcp->vcp_kernel)) {
1439		log_warnx("vm \"%s\" kernel name too long", name);
1440		errno = EINVAL;
1441		return (-1);
1442	}
1443
1444	/* cdrom */
1445	if (strlen(vcp->vcp_cdrom) > 0) {
1446		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1447			log_warnx("vm \"%s\" no permission to set cdrom", name);
1448			errno = EPERM;
1449			return (-1);
1450		}
1451		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1452	} else if (strlcpy(vcp->vcp_cdrom, vcpp->vcp_cdrom,
1453	    sizeof(vcp->vcp_cdrom)) >= sizeof(vcp->vcp_cdrom)) {
1454		log_warnx("vm \"%s\" cdrom name too long", name);
1455		errno = EINVAL;
1456		return (-1);
1457	}
1458
1459	/* user */
1460	if (vmc->vmc_owner.uid == 0)
1461		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1462	else if (vmc->vmc_owner.uid != uid &&
1463	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1464		log_warnx("vm \"%s\" user mismatch", name);
1465		errno = EPERM;
1466		return (-1);
1467	}
1468
1469	/* group */
1470	if (vmc->vmc_owner.gid == 0)
1471		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1472	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1473		log_warnx("vm \"%s\" group mismatch", name);
1474		errno = EPERM;
1475		return (-1);
1476	}
1477
1478	/* child instances */
1479	if (vmc->vmc_insflags) {
1480		log_warnx("vm \"%s\" cannot change instance permissions", name);
1481		errno = EPERM;
1482		return (-1);
1483	}
1484	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1485		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
1486		vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid;
1487		vmc->vmc_insflags = vmcp->vmc_insflags;
1488	} else {
1489		vmc->vmc_insowner.gid = 0;
1490		vmc->vmc_insowner.uid = 0;
1491		vmc->vmc_insflags = 0;
1492	}
1493
1494	/* finished, remove instance flags */
1495	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1496
1497	return (0);
1498}
1499
1500/*
1501 * vm_checkperm
1502 *
1503 * Checks if the user represented by the 'uid' parameter is allowed to
1504 * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1505 * console.)
1506 *
1507 * Parameters:
1508 *  vm: the VM whose permission is to be checked
1509 *  vmo: the required uid/gid to be checked
1510 *  uid: the user ID of the user making the request
1511 *
1512 * Return values:
1513 *   0: the permission should be granted
1514 *  -1: the permission check failed (also returned if vm == null)
1515 */
1516int
1517vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1518{
1519	struct group	*gr;
1520	struct passwd	*pw;
1521	char		**grmem;
1522
1523	/* root has no restrictions */
1524	if (uid == 0)
1525		return (0);
1526
1527	if (vmo == NULL)
1528		return (-1);
1529
1530	/* check user */
1531	if (vm == NULL) {
1532		if  (vmo->uid == uid)
1533			return (0);
1534	} else {
1535		/*
1536		 * check user of running vm (the owner of a running vm can
1537		 * be different to (or more specific than) the configured owner.
1538		 */
1539		if ((vm->vm_running && vm->vm_uid == uid) ||
1540		    (!vm->vm_running && vmo->uid == uid))
1541			return (0);
1542	}
1543
1544	/* check groups */
1545	if (vmo->gid != -1) {
1546		if ((pw = getpwuid(uid)) == NULL)
1547			return (-1);
1548		if (pw->pw_gid == vmo->gid)
1549			return (0);
1550		if ((gr = getgrgid(vmo->gid)) != NULL) {
1551			for (grmem = gr->gr_mem; *grmem; grmem++)
1552				if (strcmp(*grmem, pw->pw_name) == 0)
1553					return (0);
1554		}
1555	}
1556
1557	return (-1);
1558}
1559
1560/*
1561 * vm_checkinsflag
1562 *
1563 * Checks wheter the non-root user is allowed to set an instance option.
1564 *
1565 * Parameters:
1566 *  vmc: the VM create parameters
1567 *  flag: the flag to be checked
1568 *  uid: the user ID of the user making the request
1569 *
1570 * Return values:
1571 *   0: the permission should be granted
1572 *  -1: the permission check failed (also returned if vm == null)
1573 */
1574int
1575vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1576{
1577	/* root has no restrictions */
1578	if (uid == 0)
1579		return (0);
1580
1581	if ((vmc->vmc_insflags & flag) == 0)
1582		return (-1);
1583
1584	return (0);
1585}
1586
1587/*
1588 * vm_checkaccess
1589 *
1590 * Checks if the user represented by the 'uid' parameter is allowed to
1591 * access the file described by the 'path' parameter.
1592 *
1593 * Parameters:
1594 *  fd: the file descriptor of the opened file
1595 *  uflag: check if the userid has access to the file
1596 *  uid: the user ID of the user making the request
1597 *  amode: the access flags of R_OK and W_OK
1598 *
1599 * Return values:
1600 *   0: the permission should be granted
1601 *  -1: the permission check failed
1602 */
1603int
1604vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
1605{
1606	struct group	*gr;
1607	struct passwd	*pw;
1608	char		**grmem;
1609	struct stat	 st;
1610	mode_t		 mode;
1611
1612	if (fd == -1)
1613		return (-1);
1614
1615	/*
1616	 * File has to be accessible and a regular file
1617	 */
1618	if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode))
1619		return (-1);
1620
1621	/* root has no restrictions */
1622	if (uid == 0 || uflag == 0)
1623		return (0);
1624
1625	/* check other */
1626	mode = amode & W_OK ? S_IWOTH : 0;
1627	mode |= amode & R_OK ? S_IROTH : 0;
1628	if ((st.st_mode & mode) == mode)
1629		return (0);
1630
1631	/* check user */
1632	mode = amode & W_OK ? S_IWUSR : 0;
1633	mode |= amode & R_OK ? S_IRUSR : 0;
1634	if (uid == st.st_uid && (st.st_mode & mode) == mode)
1635		return (0);
1636
1637	/* check groups */
1638	mode = amode & W_OK ? S_IWGRP : 0;
1639	mode |= amode & R_OK ? S_IRGRP : 0;
1640	if ((st.st_mode & mode) != mode)
1641		return (-1);
1642	if ((pw = getpwuid(uid)) == NULL)
1643		return (-1);
1644	if (pw->pw_gid == st.st_gid)
1645		return (0);
1646	if ((gr = getgrgid(st.st_gid)) != NULL) {
1647		for (grmem = gr->gr_mem; *grmem; grmem++)
1648			if (strcmp(*grmem, pw->pw_name) == 0)
1649				return (0);
1650	}
1651
1652	return (-1);
1653}
1654
/*
 * vm_opentty
 *
 * Allocate a pseudo tty for the VM console via the pre-opened PTM
 * device and hand ownership and permissions of the slave device over
 * to the VM owner, loosely like login(1) does for a user's terminal.
 *
 * Parameters:
 *  vm: the VM the tty is opened for
 *
 * Return values:
 *   0: success
 *  -1: failure; any partially set up tty state is released again
 */
int
vm_opentty(struct vmd_vm *vm)
{
	struct ptmget		 ptm;
	struct stat		 st;
	struct group		*gr;
	uid_t			 uid;
	gid_t			 gid;
	mode_t			 mode;
	int			 on;

	/*
	 * Open tty with pre-opened PTM fd
	 */
	if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1))
		return (-1);

	/*
	 * We use user ioctl(2) mode to pass break commands.
	 */
	on = 1;
	if (ioctl(ptm.cfd, TIOCUCNTL, &on))
		fatal("could not enable user ioctl mode");

	/* keep the controlling side, the slave is reopened by the user */
	vm->vm_tty = ptm.cfd;
	close(ptm.sfd);
	if ((vm->vm_ttyname = strdup(ptm.sn)) == NULL)
		goto fail;

	uid = vm->vm_uid;
	gid = vm->vm_params.vmc_owner.gid;

	/* pick tty group and mode depending on the configured owner */
	if (vm->vm_params.vmc_owner.gid != -1) {
		mode = 0660;
	} else if ((gr = getgrnam("tty")) != NULL) {
		gid = gr->gr_gid;
		mode = 0620;
	} else {
		mode = 0600;
		gid = 0;
	}

	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
	    __func__, vm->vm_params.vmc_params.vcp_name,
	    vm->vm_ttyname, uid, gid, mode);

	/*
	 * Change ownership and mode of the tty as required.
	 * Loosely based on the implementation of sshpty.c
	 */
	if (stat(vm->vm_ttyname, &st) == -1)
		goto fail;

	if (st.st_uid != uid || st.st_gid != gid) {
		if (chown(vm->vm_ttyname, uid, gid) == -1) {
			log_warn("chown %s %d %d failed, uid %d",
			    vm->vm_ttyname, uid, gid, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
		if (chmod(vm->vm_ttyname, mode) == -1) {
			log_warn("chmod %s %o failed, uid %d",
			    vm->vm_ttyname, mode, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	return (0);
 fail:
	vm_closetty(vm);
	return (-1);
}
1737
1738void
1739vm_closetty(struct vmd_vm *vm)
1740{
1741	if (vm->vm_tty != -1) {
1742		/* Release and close the tty */
1743		if (fchown(vm->vm_tty, 0, 0) == -1)
1744			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1745		if (fchmod(vm->vm_tty, 0666) == -1)
1746			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1747		close(vm->vm_tty);
1748		vm->vm_tty = -1;
1749	}
1750	free(vm->vm_ttyname);
1751	vm->vm_ttyname = NULL;
1752}
1753
1754void
1755switch_remove(struct vmd_switch *vsw)
1756{
1757	if (vsw == NULL)
1758		return;
1759
1760	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1761
1762	free(vsw->sw_group);
1763	free(vsw->sw_name);
1764	free(vsw);
1765}
1766
1767struct vmd_switch *
1768switch_getbyname(const char *name)
1769{
1770	struct vmd_switch	*vsw;
1771
1772	if (name == NULL)
1773		return (NULL);
1774	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1775		if (strcmp(vsw->sw_name, name) == 0)
1776			return (vsw);
1777	}
1778
1779	return (NULL);
1780}
1781
1782struct vmd_user *
1783user_get(uid_t uid)
1784{
1785	struct vmd_user		*usr;
1786
1787	if (uid == 0)
1788		return (NULL);
1789
1790	/* first try to find an existing user */
1791	TAILQ_FOREACH(usr, env->vmd_users, usr_entry) {
1792		if (usr->usr_id.uid == uid)
1793			goto done;
1794	}
1795
1796	if ((usr = calloc(1, sizeof(*usr))) == NULL) {
1797		log_warn("could not allocate user");
1798		return (NULL);
1799	}
1800
1801	usr->usr_id.uid = uid;
1802	usr->usr_id.gid = -1;
1803	TAILQ_INSERT_TAIL(env->vmd_users, usr, usr_entry);
1804
1805 done:
1806	DPRINTF("%s: uid %d #%d +",
1807	    __func__, usr->usr_id.uid, usr->usr_refcnt + 1);
1808	usr->usr_refcnt++;
1809
1810	return (usr);
1811}
1812
1813void
1814user_put(struct vmd_user *usr)
1815{
1816	if (usr == NULL)
1817		return;
1818
1819	DPRINTF("%s: uid %d #%d -",
1820	    __func__, usr->usr_id.uid, usr->usr_refcnt - 1);
1821
1822	if (--usr->usr_refcnt > 0)
1823		return;
1824
1825	TAILQ_REMOVE(env->vmd_users, usr, usr_entry);
1826	free(usr);
1827}
1828
1829void
1830user_inc(struct vm_create_params *vcp, struct vmd_user *usr, int inc)
1831{
1832	char	 mem[FMT_SCALED_STRSIZE];
1833
1834	if (usr == NULL)
1835		return;
1836
1837	/* increment or decrement counters */
1838	inc = inc ? 1 : -1;
1839
1840	usr->usr_maxcpu += vcp->vcp_ncpus * inc;
1841	usr->usr_maxmem += vcp->vcp_memranges[0].vmr_size * inc;
1842	usr->usr_maxifs += vcp->vcp_nnics * inc;
1843
1844	if (log_getverbose() > 1) {
1845		(void)fmt_scaled(usr->usr_maxmem * 1024 * 1024, mem);
1846		log_debug("%s: %c uid %d ref %d cpu %llu mem %s ifs %llu",
1847		    __func__, inc == 1 ? '+' : '-',
1848		    usr->usr_id.uid, usr->usr_refcnt,
1849		    usr->usr_maxcpu, mem, usr->usr_maxifs);
1850	}
1851}
1852
1853int
1854user_checklimit(struct vmd_user *usr, struct vm_create_params *vcp)
1855{
1856	const char	*limit = "";
1857
1858	/* XXX make the limits configurable */
1859	if (usr->usr_maxcpu > VM_DEFAULT_USER_MAXCPU) {
1860		limit = "cpu ";
1861		goto fail;
1862	}
1863	if (usr->usr_maxcpu > VM_DEFAULT_USER_MAXMEM) {
1864		limit = "memory ";
1865		goto fail;
1866	}
1867	if (usr->usr_maxifs > VM_DEFAULT_USER_MAXIFS) {
1868		limit = "interface ";
1869		goto fail;
1870	}
1871
1872	return (0);
1873
1874 fail:
1875	log_warnx("%s: user %d %slimit reached", vcp->vcp_name,
1876	    usr->usr_id.uid, limit);
1877	return (-1);
1878}
1879
1880char *
1881get_string(uint8_t *ptr, size_t len)
1882{
1883	size_t	 i;
1884
1885	for (i = 0; i < len; i++)
1886		if (!isprint(ptr[i]))
1887			break;
1888
1889	return strndup(ptr, i);
1890}
1891
/*
 * prefixlen2mask
 *
 * Convert a CIDR prefix length into an IPv4 netmask in network byte
 * order.  Lengths greater than 32 are clamped to 32.
 */
uint32_t
prefixlen2mask(uint8_t prefixlen)
{
	uint32_t	 len = prefixlen;

	if (len == 0)
		return (0);

	if (len > 32)
		len = 32;

	return (htonl(0xffffffff << (32 - len)));
}
1903