vmd.c revision 1.105
1/*	$OpenBSD: vmd.c,v 1.105 2018/11/21 12:31:47 reyk Exp $	*/
2
3/*
4 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/param.h>	/* nitems */
20#include <sys/queue.h>
21#include <sys/wait.h>
22#include <sys/cdefs.h>
23#include <sys/stat.h>
24#include <sys/tty.h>
25#include <sys/ttycom.h>
26#include <sys/ioctl.h>
27
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31#include <termios.h>
32#include <errno.h>
33#include <event.h>
34#include <fcntl.h>
35#include <pwd.h>
36#include <signal.h>
37#include <syslog.h>
38#include <unistd.h>
39#include <util.h>
40#include <ctype.h>
41#include <pwd.h>
42#include <grp.h>
43
44#include <machine/specialreg.h>
45#include <machine/vmmvar.h>
46
47#include "proc.h"
48#include "atomicio.h"
49#include "vmd.h"
50
51__dead void usage(void);
52
53int	 main(int, char **);
54int	 vmd_configure(void);
55void	 vmd_sighdlr(int sig, short event, void *arg);
56void	 vmd_shutdown(void);
57int	 vmd_control_run(void);
58int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
59int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
60int	 vmd_check_vmh(struct vm_dump_header *);
61
62int	 vm_instance(struct privsep *, struct vmd_vm **,
63	    struct vmop_create_params *, uid_t);
64int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
65
66struct vmd	*env;
67
68static struct privsep_proc procs[] = {
69	/* Keep "priv" on top as procs[0] */
70	{ "priv",	PROC_PRIV,	NULL, priv },
71	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
72	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm, vmm_shutdown },
73};
74
75/* For the privileged process */
76static struct privsep_proc *proc_priv = &procs[0];
77static struct passwd proc_privpw;
78static const uint8_t zero_mac[ETHER_ADDR_LEN];
79
80int
81vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
82{
83	struct privsep			*ps = p->p_ps;
84	int				 res = 0, ret = 0, cmd = 0, verbose;
85	unsigned int			 v = 0, flags;
86	struct vmop_create_params	 vmc;
87	struct vmop_id			 vid;
88	struct vmop_result		 vmr;
89	struct vm_dump_header		 vmh;
90	struct vmd_vm			*vm = NULL;
91	char				*str = NULL;
92	uint32_t			 id = 0;
93	struct control_sock		*rcs;
94
95	switch (imsg->hdr.type) {
96	case IMSG_VMDOP_START_VM_REQUEST:
97		IMSG_SIZE_CHECK(imsg, &vmc);
98		memcpy(&vmc, imsg->data, sizeof(vmc));
99		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
100		if (vmc.vmc_flags == 0) {
101			/* start an existing VM with pre-configured options */
102			if (!(ret == -1 && errno == EALREADY &&
103			    vm->vm_running == 0)) {
104				res = errno;
105				cmd = IMSG_VMDOP_START_VM_RESPONSE;
106			}
107		} else if (ret != 0) {
108			res = errno;
109			cmd = IMSG_VMDOP_START_VM_RESPONSE;
110		}
111		if (res == 0 &&
112		    config_setvm(ps, vm,
113		    imsg->hdr.peerid, vm->vm_params.vmc_owner.uid) == -1) {
114			res = errno;
115			cmd = IMSG_VMDOP_START_VM_RESPONSE;
116		}
117		break;
118	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
119		IMSG_SIZE_CHECK(imsg, &vid);
120		memcpy(&vid, imsg->data, sizeof(vid));
121		flags = vid.vid_flags;
122
123		if ((id = vid.vid_id) == 0) {
124			/* Lookup vm (id) by name */
125			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
126				res = ENOENT;
127				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
128				break;
129			} else if (vm->vm_shutdown &&
130			    (flags & VMOP_FORCE) == 0) {
131				res = EALREADY;
132				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
133				break;
134			} else if (vm->vm_running == 0) {
135				res = EINVAL;
136				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
137				break;
138			}
139			id = vm->vm_vmid;
140		} else if ((vm = vm_getbyvmid(id)) == NULL) {
141			res = ENOENT;
142			cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
143			break;
144		}
145		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
146		    vid.vid_uid) != 0) {
147			res = EPERM;
148			cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
149			break;
150		}
151
152		memset(&vid, 0, sizeof(vid));
153		vid.vid_id = id;
154		vid.vid_flags = flags;
155		if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
156		    imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
157			return (-1);
158		break;
159	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
160		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
161		break;
162	case IMSG_VMDOP_LOAD:
163		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
164		str = get_string((uint8_t *)imsg->data,
165		    IMSG_DATA_SIZE(imsg));
166	case IMSG_VMDOP_RELOAD:
167		if (vmd_reload(0, str) == -1)
168			cmd = IMSG_CTL_FAIL;
169		else
170			cmd = IMSG_CTL_OK;
171		free(str);
172		break;
173	case IMSG_CTL_RESET:
174		IMSG_SIZE_CHECK(imsg, &v);
175		memcpy(&v, imsg->data, sizeof(v));
176		if (vmd_reload(v, NULL) == -1)
177			cmd = IMSG_CTL_FAIL;
178		else
179			cmd = IMSG_CTL_OK;
180		break;
181	case IMSG_CTL_VERBOSE:
182		IMSG_SIZE_CHECK(imsg, &verbose);
183		memcpy(&verbose, imsg->data, sizeof(verbose));
184		log_setverbose(verbose);
185
186		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
187		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
188		cmd = IMSG_CTL_OK;
189		break;
190	case IMSG_VMDOP_PAUSE_VM:
191	case IMSG_VMDOP_UNPAUSE_VM:
192		IMSG_SIZE_CHECK(imsg, &vid);
193		memcpy(&vid, imsg->data, sizeof(vid));
194		if (vid.vid_id == 0) {
195			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
196				res = ENOENT;
197				cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
198				break;
199			} else {
200				vid.vid_id = vm->vm_vmid;
201			}
202		} else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
203			res = ENOENT;
204			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
205			break;
206		}
207		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
208		    vid.vid_uid) != 0) {
209			res = EPERM;
210			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
211			break;
212		}
213		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
214		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
215		break;
216	case IMSG_VMDOP_SEND_VM_REQUEST:
217		IMSG_SIZE_CHECK(imsg, &vid);
218		memcpy(&vid, imsg->data, sizeof(vid));
219		id = vid.vid_id;
220		if (vid.vid_id == 0) {
221			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
222				res = ENOENT;
223				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
224				close(imsg->fd);
225				break;
226			} else {
227				vid.vid_id = vm->vm_vmid;
228			}
229		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
230			res = ENOENT;
231			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
232			close(imsg->fd);
233			break;
234		} else {
235		}
236		vmr.vmr_id = vid.vid_id;
237		log_debug("%s: sending fd to vmm", __func__);
238		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
239		    imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid));
240		break;
241	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
242		IMSG_SIZE_CHECK(imsg, &vid);
243		memcpy(&vid, imsg->data, sizeof(vid));
244		if (imsg->fd == -1) {
245			log_warnx("%s: invalid fd", __func__);
246			return (-1);
247		}
248		if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) !=
249		    sizeof(vmh)) {
250			log_warnx("%s: error reading vmh from received vm",
251			    __func__);
252			res = EIO;
253			close(imsg->fd);
254			cmd = IMSG_VMDOP_START_VM_RESPONSE;
255			break;
256		}
257
258		if (vmd_check_vmh(&vmh)) {
259			res = ENOENT;
260			close(imsg->fd);
261			cmd = IMSG_VMDOP_START_VM_RESPONSE;
262			break;
263		}
264		if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) !=
265		    sizeof(vmc)) {
266			log_warnx("%s: error reading vmc from received vm",
267			    __func__);
268			res = EIO;
269			close(imsg->fd);
270			cmd = IMSG_VMDOP_START_VM_RESPONSE;
271			break;
272		}
273		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
274		    sizeof(vmc.vmc_params.vcp_name));
275		vmc.vmc_params.vcp_id = 0;
276
277		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
278		if (ret != 0) {
279			res = errno;
280			cmd = IMSG_VMDOP_START_VM_RESPONSE;
281			close(imsg->fd);
282		} else {
283			vm->vm_received = 1;
284			config_setvm(ps, vm, imsg->hdr.peerid,
285			    vmc.vmc_owner.uid);
286			log_debug("%s: sending fd to vmm", __func__);
287			proc_compose_imsg(ps, PROC_VMM, -1,
288			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd,
289			    NULL, 0);
290		}
291		break;
292	case IMSG_VMDOP_DONE:
293		control_reset(&ps->ps_csock);
294		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
295			control_reset(rcs);
296		cmd = 0;
297		break;
298	default:
299		return (-1);
300	}
301
302	switch (cmd) {
303	case 0:
304		break;
305	case IMSG_VMDOP_START_VM_RESPONSE:
306	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
307		memset(&vmr, 0, sizeof(vmr));
308		vmr.vmr_result = res;
309		vmr.vmr_id = id;
310		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
311		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
312			return (-1);
313		break;
314	default:
315		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
316		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
317			return (-1);
318		break;
319	}
320
321	return (0);
322}
323
324int
325vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
326{
327	struct vmop_result	 vmr;
328	struct privsep		*ps = p->p_ps;
329	int			 res = 0;
330	struct vmd_vm		*vm;
331	struct vm_create_params	*vcp;
332	struct vmop_info_result	 vir;
333
334	switch (imsg->hdr.type) {
335	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
336		IMSG_SIZE_CHECK(imsg, &vmr);
337		memcpy(&vmr, imsg->data, sizeof(vmr));
338		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
339			break;
340		proc_compose_imsg(ps, PROC_CONTROL, -1,
341		    imsg->hdr.type, imsg->hdr.peerid, -1,
342		    imsg->data, sizeof(imsg->data));
343		log_info("%s: paused vm %d successfully",
344		    vm->vm_params.vmc_params.vcp_name,
345		    vm->vm_vmid);
346		break;
347	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
348		IMSG_SIZE_CHECK(imsg, &vmr);
349		memcpy(&vmr, imsg->data, sizeof(vmr));
350		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
351			break;
352		proc_compose_imsg(ps, PROC_CONTROL, -1,
353		    imsg->hdr.type, imsg->hdr.peerid, -1,
354		    imsg->data, sizeof(imsg->data));
355		log_info("%s: unpaused vm %d successfully.",
356		    vm->vm_params.vmc_params.vcp_name,
357		    vm->vm_vmid);
358		break;
359	case IMSG_VMDOP_START_VM_RESPONSE:
360		IMSG_SIZE_CHECK(imsg, &vmr);
361		memcpy(&vmr, imsg->data, sizeof(vmr));
362		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
363			break;
364		vm->vm_pid = vmr.vmr_pid;
365		vcp = &vm->vm_params.vmc_params;
366		vcp->vcp_id = vmr.vmr_id;
367
368		/*
369		 * If the peerid is not -1, forward the response back to the
370		 * the control socket.  If it is -1, the request originated
371		 * from the parent, not the control socket.
372		 */
373		if (vm->vm_peerid != (uint32_t)-1) {
374			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
375			    sizeof(vmr.vmr_ttyname));
376			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
377			    imsg->hdr.type, vm->vm_peerid, -1,
378			    &vmr, sizeof(vmr)) == -1) {
379				errno = vmr.vmr_result;
380				log_warn("%s: failed to foward vm result",
381				    vcp->vcp_name);
382				vm_remove(vm, __func__);
383				return (-1);
384			}
385		}
386
387		if (vmr.vmr_result) {
388			errno = vmr.vmr_result;
389			log_warn("%s: failed to start vm", vcp->vcp_name);
390			vm_remove(vm, __func__);
391			break;
392		}
393
394		/* Now configure all the interfaces */
395		if (vm_priv_ifconfig(ps, vm) == -1) {
396			log_warn("%s: failed to configure vm", vcp->vcp_name);
397			vm_remove(vm, __func__);
398			break;
399		}
400
401		log_info("%s: started vm %d successfully, tty %s",
402		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
403		break;
404	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
405		IMSG_SIZE_CHECK(imsg, &vmr);
406		memcpy(&vmr, imsg->data, sizeof(vmr));
407		DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
408		    __func__, vmr.vmr_id);
409		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
410		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
411			break;
412		if (vmr.vmr_result == 0) {
413			/* Mark VM as shutting down */
414			vm->vm_shutdown = 1;
415		}
416		break;
417	case IMSG_VMDOP_SEND_VM_RESPONSE:
418		IMSG_SIZE_CHECK(imsg, &vmr);
419		memcpy(&vmr, imsg->data, sizeof(vmr));
420		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
421			break;
422		if (!vmr.vmr_result) {
423			log_info("%s: sent vm %d successfully.",
424			    vm->vm_params.vmc_params.vcp_name,
425			    vm->vm_vmid);
426			if (vm->vm_from_config)
427				vm_stop(vm, 0, __func__);
428			else
429				vm_remove(vm, __func__);
430		}
431
432		/* Send a response if a control client is waiting for it */
433		if (imsg->hdr.peerid != (uint32_t)-1) {
434			/* the error is meaningless for deferred responses */
435			vmr.vmr_result = 0;
436
437			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
438			    IMSG_VMDOP_SEND_VM_RESPONSE,
439			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
440				return (-1);
441		}
442		break;
443	case IMSG_VMDOP_TERMINATE_VM_EVENT:
444		IMSG_SIZE_CHECK(imsg, &vmr);
445		memcpy(&vmr, imsg->data, sizeof(vmr));
446		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
447		    __func__, vmr.vmr_id, vmr.vmr_result);
448		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
449			log_debug("%s: vm %d is no longer available",
450			    __func__, vmr.vmr_id);
451			break;
452		}
453		if (vmr.vmr_result != EAGAIN) {
454			if (vm->vm_from_config)
455				vm_stop(vm, 0, __func__);
456			else
457				vm_remove(vm, __func__);
458		} else {
459			/* Stop VM instance but keep the tty open */
460			vm_stop(vm, 1, __func__);
461			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
462		}
463
464		/* Send a response if a control client is waiting for it */
465		if (imsg->hdr.peerid != (uint32_t)-1) {
466			/* the error is meaningless for deferred responses */
467			vmr.vmr_result = 0;
468
469			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
470			    IMSG_VMDOP_TERMINATE_VM_RESPONSE,
471			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
472				return (-1);
473		}
474		break;
475	case IMSG_VMDOP_GET_INFO_VM_DATA:
476		IMSG_SIZE_CHECK(imsg, &vir);
477		memcpy(&vir, imsg->data, sizeof(vir));
478		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
479			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
480			if (vm->vm_ttyname != NULL)
481				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
482				    sizeof(vir.vir_ttyname));
483			if (vm->vm_shutdown) {
484				/* XXX there might be a nicer way */
485				(void)strlcat(vir.vir_info.vir_name,
486				    " - stopping",
487				    sizeof(vir.vir_info.vir_name));
488			}
489			/* get the user id who started the vm */
490			vir.vir_uid = vm->vm_uid;
491			vir.vir_gid = vm->vm_params.vmc_owner.gid;
492		}
493		if (proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type,
494		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
495			log_debug("%s: GET_INFO_VM failed for vm %d, removing",
496			    __func__, vm->vm_vmid);
497			vm_remove(vm, __func__);
498			return (-1);
499		}
500		break;
501	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
502		/*
503		 * PROC_VMM has responded with the *running* VMs, now we
504		 * append the others. These use the special value 0 for their
505		 * kernel id to indicate that they are not running.
506		 */
507		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
508			if (!vm->vm_running) {
509				memset(&vir, 0, sizeof(vir));
510				vir.vir_info.vir_id = vm->vm_vmid;
511				strlcpy(vir.vir_info.vir_name,
512				    vm->vm_params.vmc_params.vcp_name,
513				    VMM_MAX_NAME_LEN);
514				vir.vir_info.vir_memory_size =
515				    vm->vm_params.vmc_params.
516				    vcp_memranges[0].vmr_size;
517				vir.vir_info.vir_ncpus =
518				    vm->vm_params.vmc_params.vcp_ncpus;
519				/* get the configured user id for this vm */
520				vir.vir_uid = vm->vm_params.vmc_owner.uid;
521				vir.vir_gid = vm->vm_params.vmc_owner.gid;
522				if (proc_compose_imsg(ps, PROC_CONTROL, -1,
523				    IMSG_VMDOP_GET_INFO_VM_DATA,
524				    imsg->hdr.peerid, -1, &vir,
525				    sizeof(vir)) == -1) {
526					log_debug("%s: GET_INFO_VM_END failed",
527					    __func__);
528					vm_remove(vm, __func__);
529					return (-1);
530				}
531			}
532		}
533		IMSG_SIZE_CHECK(imsg, &res);
534		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
535		break;
536	default:
537		return (-1);
538	}
539
540	return (0);
541}
542
543int
544vmd_check_vmh(struct vm_dump_header *vmh)
545{
546	int i;
547	unsigned int code, leaf;
548	unsigned int a, b, c, d;
549
550
551	if (vmh->vmh_version != VM_DUMP_VERSION) {
552		log_warnx("%s: incompatible dump version", __func__);
553		return (-1);
554	}
555
556	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
557		code = vmh->vmh_cpuids[i].code;
558		leaf = vmh->vmh_cpuids[i].leaf;
559		if (leaf != 0x00) {
560			log_debug("%s: invalid leaf 0x%x for code 0x%x",
561			    __func__, leaf, code);
562			return (-1);
563		}
564
565		switch (code) {
566		case 0x00:
567			CPUID_LEAF(code, leaf, a, b, c, d);
568			if (vmh->vmh_cpuids[i].a > a) {
569				log_debug("%s: incompatible cpuid level",
570				    __func__);
571				return (-1);
572			}
573			if (!(vmh->vmh_cpuids[i].b == b &&
574			    vmh->vmh_cpuids[i].c == c &&
575			    vmh->vmh_cpuids[i].d == d)) {
576				log_debug("%s: incompatible cpu brand",
577				    __func__);
578				return (-1);
579			}
580			break;
581
582		case 0x01:
583			CPUID_LEAF(code, leaf, a, b, c, d);
584			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
585			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
586				log_debug("%s: incompatible cpu features "
587				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
588				    code, leaf);
589				return (-1);
590			}
591			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
592			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
593				log_debug("%s: incompatible cpu features "
594				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
595				    code, leaf);
596				return (-1);
597			}
598			break;
599
600		case 0x07:
601			CPUID_LEAF(code, leaf, a, b, c, d);
602			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
603			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
604				log_debug("%s: incompatible cpu features "
605				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
606				    code, leaf);
607				return (-1);
608			}
609			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
610			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
611				log_debug("%s: incompatible cpu features "
612				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
613				    code, leaf);
614				return (-1);
615			}
616			break;
617
618		case 0x0d:
619			CPUID_LEAF(code, leaf, a, b, c, d);
620			if (vmh->vmh_cpuids[i].b > b) {
621				log_debug("%s: incompatible cpu: insufficient "
622				    "max save area for enabled XCR0 features",
623				    __func__);
624				return (-1);
625			}
626			if (vmh->vmh_cpuids[i].c > c) {
627				log_debug("%s: incompatible cpu: insufficient "
628				    "max save area for supported XCR0 features",
629				    __func__);
630				return (-1);
631			}
632			break;
633
634		case 0x80000001:
635			CPUID_LEAF(code, leaf, a, b, c, d);
636			if ((vmh->vmh_cpuids[i].a & a) !=
637			    vmh->vmh_cpuids[i].a) {
638				log_debug("%s: incompatible cpu features "
639				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
640				    code, leaf);
641				return (-1);
642			}
643			if ((vmh->vmh_cpuids[i].c & c) !=
644			    vmh->vmh_cpuids[i].c) {
645				log_debug("%s: incompatible cpu features "
646				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
647				    code, leaf);
648				return (-1);
649			}
650			if ((vmh->vmh_cpuids[i].d & d) !=
651			    vmh->vmh_cpuids[i].d) {
652				log_debug("%s: incompatible cpu features "
653				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
654				    code, leaf);
655				return (-1);
656			}
657			break;
658
659		default:
660			log_debug("%s: unknown code 0x%x", __func__, code);
661			return (-1);
662		}
663	}
664
665	return (0);
666}
667
668void
669vmd_sighdlr(int sig, short event, void *arg)
670{
671	if (privsep_process != PROC_PARENT)
672		return;
673	log_debug("%s: handling signal", __func__);
674
675	switch (sig) {
676	case SIGHUP:
677		log_info("%s: reload requested with SIGHUP", __func__);
678
679		/*
680		 * This is safe because libevent uses async signal handlers
681		 * that run in the event loop and not in signal context.
682		 */
683		(void)vmd_reload(0, NULL);
684		break;
685	case SIGPIPE:
686		log_info("%s: ignoring SIGPIPE", __func__);
687		break;
688	case SIGUSR1:
689		log_info("%s: ignoring SIGUSR1", __func__);
690		break;
691	case SIGTERM:
692	case SIGINT:
693		vmd_shutdown();
694		break;
695	default:
696		fatalx("unexpected signal");
697	}
698}
699
700__dead void
701usage(void)
702{
703	extern char *__progname;
704	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
705	    __progname);
706	exit(1);
707}
708
/*
 * Entry point for the parent process and, via re-exec with -P/-I, for
 * the privsep child processes.  Parses options, sets up the global vmd
 * environment, forks the children and runs the parent event loop.
 * Only the parent process returns from proc_init().
 */
int
main(int argc, char **argv)
{
	struct privsep		*ps;
	int			 ch;
	const char		*conffile = VMD_CONF;
	enum privsep_procid	 proc_id = PROC_PARENT;
	int			 proc_instance = 0;
	const char		*errp, *title = NULL;
	/* proc_init() re-execs the children with the original argv/argc */
	int			 argc0 = argc;

	/* Log to syslog until the -d/-v options have been parsed */
	log_init(0, LOG_DAEMON);

	if ((env = calloc(1, sizeof(*env))) == NULL)
		fatal("calloc: env");

	while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) {
		switch (ch) {
		case 'D':
			/* define a config-file macro on the command line */
			if (cmdline_symset(optarg) < 0)
				log_warnx("could not parse macro definition %s",
				    optarg);
			break;
		case 'd':
			env->vmd_debug = 2;
			break;
		case 'f':
			conffile = optarg;
			break;
		case 'v':
			env->vmd_verbose++;
			break;
		case 'n':
			/* configtest mode: parse config and exit */
			env->vmd_noaction = 1;
			break;
		case 'P':
			/* internal: run as the named privsep child */
			title = optarg;
			proc_id = proc_getid(procs, nitems(procs), title);
			if (proc_id == PROC_MAX)
				fatalx("invalid process name");
			break;
		case 'I':
			/* internal: instance number of the child */
			proc_instance = strtonum(optarg, 0,
			    PROC_MAX_INSTANCES, &errp);
			if (errp)
				fatalx("invalid process instance");
			break;
		default:
			usage();
		}
	}

	argc -= optind;
	if (argc > 0)
		usage();

	if (env->vmd_noaction && !env->vmd_debug)
		env->vmd_debug = 1;

	/* check for root privileges */
	if (env->vmd_noaction == 0) {
		if (geteuid())
			fatalx("need root privileges");
	}

	ps = &env->vmd_ps;
	ps->ps_env = env;
	env->vmd_fd = -1;

	if (config_init(env) == -1)
		fatal("failed to initialize configuration");

	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
		fatal("unknown user %s", VMD_USER);

	/* First proc runs as root without pledge but in default chroot */
	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */

	/* Open /dev/vmm */
	if (env->vmd_noaction == 0) {
		env->vmd_fd = open(VMM_NODE, O_RDWR);
		if (env->vmd_fd == -1)
			fatal("%s", VMM_NODE);
	}

	/* Configure the control socket */
	ps->ps_csock.cs_name = SOCKET_NAME;
	TAILQ_INIT(&ps->ps_rcsocks);

	/* Configuration will be parsed after forking the children */
	env->vmd_conffile = conffile;

	/* Re-init logging now that debug/verbose levels are known */
	log_init(env->vmd_debug, LOG_DAEMON);
	log_setverbose(env->vmd_verbose);

	if (env->vmd_noaction)
		ps->ps_noaction = 1;
	ps->ps_instance = proc_instance;
	if (title != NULL)
		ps->ps_title[proc_id] = title;

	/* only the parent returns */
	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
	    proc_id);

	log_procinit("parent");
	if (!env->vmd_debug && daemon(0, 0) == -1)
		fatal("can't daemonize");

	if (ps->ps_noaction == 0)
		log_info("startup");

	event_init();

	/* Route the signals we care about through the event loop */
	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);

	signal_add(&ps->ps_evsigint, NULL);
	signal_add(&ps->ps_evsigterm, NULL);
	signal_add(&ps->ps_evsighup, NULL);
	signal_add(&ps->ps_evsigpipe, NULL);
	signal_add(&ps->ps_evsigusr1, NULL);

	if (!env->vmd_noaction)
		proc_connect(ps);

	if (vmd_configure() == -1)
		fatalx("configuration failed");

	event_dispatch();

	log_debug("parent exiting");

	return (0);
}
848
/*
 * Parent-side startup configuration: pledge the parent, parse the
 * config file, distribute the global config to the children, create
 * the configured switches and start the enabled vms.
 *
 * Returns 0 on success, -1 on failure; in noaction (-n) mode it exits
 * directly after the config check.
 */
int
vmd_configure(void)
{
	struct vmd_vm		*vm;
	struct vmd_switch	*vsw;

	/* pty master device, kept open for allocating vm consoles */
	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
		fatal("open %s", PATH_PTMDEV);

	/*
	 * pledge in the parent process:
	 * stdio - for malloc and basic I/O including events.
	 * rpath - for reload to open and read the configuration files.
	 * wpath - for opening disk images and tap devices.
	 * tty - for openpty and TIOCUCNTL.
	 * proc - run kill to terminate its children safely.
	 * sendfd - for disks, interfaces and other fds.
	 * recvfd - for send and receive.
	 * getpw - lookup user or group id by name.
	 * chown, fattr - change tty ownership
	 * flock - locking disk files
	 */
	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
	    " chown fattr flock", NULL) == -1)
		fatal("pledge");

	if (parse_config(env->vmd_conffile) == -1) {
		proc_kill(&env->vmd_ps);
		exit(1);
	}

	if (env->vmd_noaction) {
		/* -n: config parsed fine, report and tear down */
		fprintf(stderr, "configuration OK\n");
		proc_kill(&env->vmd_ps);
		exit(0);
	}

	/* Send shared global configuration to all children */
	if (config_setconfig(env) == -1)
		return (-1);

	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
		if (vsw->sw_running)
			continue;
		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
			log_warn("%s: failed to create switch %s",
			    __func__, vsw->sw_name);
			switch_remove(vsw);
			return (-1);
		}
	}

	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
		if (vm->vm_disabled) {
			log_debug("%s: not creating vm %s (disabled)",
			    __func__,
			    vm->vm_params.vmc_params.vcp_name);
			continue;
		}
		if (config_setvm(&env->vmd_ps, vm,
		    -1, vm->vm_params.vmc_owner.uid) == -1)
			return (-1);
	}

	return (0);
}
915
/*
 * Reload (or load, when a filename is given) the configuration, or
 * reset the running state when a non-zero reset level is requested.
 *
 * reset    - config_purge/config_setreset level; 0 means (re)load.
 * filename - config file to load; NULL or "" means reload the default
 *            file and also remove all non-running vms first.
 *
 * Returns 0 on success, -1 on failure.
 */
int
vmd_reload(unsigned int reset, const char *filename)
{
	struct vmd_vm		*vm, *next_vm;
	struct vmd_switch	*vsw;
	int			 reload = 0;

	/* Switch back to the default config file */
	if (filename == NULL || *filename == '\0') {
		filename = env->vmd_conffile;
		reload = 1;
	}

	log_debug("%s: level %d config file %s", __func__, reset, filename);

	if (reset) {
		/* Purge the configuration */
		config_purge(env, reset);
		config_setreset(env, reset);
	} else {
		/*
		 * Load or reload the configuration.
		 *
		 * Reloading removes all non-running VMs before processing the
		 * config file, whereas loading only adds to the existing list
		 * of VMs.
		 */

		if (reload) {
			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
			    next_vm) {
				if (vm->vm_running == 0) {
					DPRINTF("%s: calling vm_remove",
					    __func__);
					vm_remove(vm, __func__);
				}
			}
		}

		if (parse_config(filename) == -1) {
			log_debug("%s: failed to load config file %s",
			    __func__, filename);
			return (-1);
		}

		if (reload) {
			/* Update shared global configuration in all children */
			if (config_setconfig(env) == -1)
				return (-1);
		}

		/* Bring up any newly configured switches */
		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
			if (vsw->sw_running)
				continue;
			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
				log_warn("%s: failed to create switch %s",
				    __func__, vsw->sw_name);
				switch_remove(vsw);
				return (-1);
			}
		}

		/* Start vms that are enabled and not already running */
		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
			if (vm->vm_running == 0) {
				if (vm->vm_disabled) {
					log_debug("%s: not creating vm %s"
					    " (disabled)", __func__,
					    vm->vm_params.vmc_params.vcp_name);
					continue;
				}
				if (config_setvm(&env->vmd_ps, vm,
				    -1, vm->vm_params.vmc_owner.uid) == -1)
					return (-1);
			} else {
				log_debug("%s: not creating vm \"%s\": "
				    "(running)", __func__,
				    vm->vm_params.vmc_params.vcp_name);
			}
		}
	}

	return (0);
}
999
1000void
1001vmd_shutdown(void)
1002{
1003	struct vmd_vm *vm, *vm_next;
1004
1005	log_debug("%s: performing shutdown", __func__);
1006
1007	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
1008		vm_remove(vm, __func__);
1009	}
1010
1011	proc_kill(&env->vmd_ps);
1012	free(env);
1013
1014	log_warnx("parent terminating");
1015	exit(0);
1016}
1017
1018struct vmd_vm *
1019vm_getbyvmid(uint32_t vmid)
1020{
1021	struct vmd_vm	*vm;
1022
1023	if (vmid == 0)
1024		return (NULL);
1025	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1026		if (vm->vm_vmid == vmid)
1027			return (vm);
1028	}
1029
1030	return (NULL);
1031}
1032
1033struct vmd_vm *
1034vm_getbyid(uint32_t id)
1035{
1036	struct vmd_vm	*vm;
1037
1038	if (id == 0)
1039		return (NULL);
1040	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1041		if (vm->vm_params.vmc_params.vcp_id == id)
1042			return (vm);
1043	}
1044
1045	return (NULL);
1046}
1047
1048uint32_t
1049vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1050{
1051	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1052		return (0);
1053	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1054	    id, vm->vm_vmid);
1055	return (vm->vm_vmid);
1056}
1057
1058uint32_t
1059vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1060{
1061	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1062		return (0);
1063	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1064	    vmid, vm->vm_params.vmc_params.vcp_id);
1065	return (vm->vm_params.vmc_params.vcp_id);
1066}
1067
1068struct vmd_vm *
1069vm_getbyname(const char *name)
1070{
1071	struct vmd_vm	*vm;
1072
1073	if (name == NULL)
1074		return (NULL);
1075	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1076		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1077			return (vm);
1078	}
1079
1080	return (NULL);
1081}
1082
1083struct vmd_vm *
1084vm_getbypid(pid_t pid)
1085{
1086	struct vmd_vm	*vm;
1087
1088	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1089		if (vm->vm_pid == pid)
1090			return (vm);
1091	}
1092
1093	return (NULL);
1094}
1095
/*
 * vm_stop
 *
 * Stop a VM: close all of its open file descriptors (control channel,
 * disk images, network interfaces, kernel and cdrom images) and drop
 * its per-user resource accounting.  The VM stays on the configured
 * VM list; use vm_remove() to delete it entirely.
 *
 * Parameters:
 *  vm:      the VM to stop; NULL is a no-op
 *  keeptty: when non-zero, keep the console tty open so an attached
 *           console client survives a VM restart
 *  caller:  name of the calling function, used in the debug log only
 */
void
vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 i, j;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s stopping vm %d%s",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");

	vm->vm_running = 0;
	vm->vm_shutdown = 0;

	/* subtract the VM's resources from the owner's counters and
	 * drop the user reference */
	user_inc(&vm->vm_params.vmc_params, vm->vm_user, 0);
	user_put(vm->vm_user);
	/* NOTE(review): vm->vm_user is not cleared here although its
	 * reference was dropped; vm_remove() calls user_put() on it
	 * again — confirm the reference counting balances against
	 * where user_get() is called. */

	/* close the imsg channel to the VM process, if any */
	if (vm->vm_iev.ibuf.fd != -1) {
		event_del(&vm->vm_iev.ev);
		close(vm->vm_iev.ibuf.fd);
	}
	/* close every disk image fd, including base images */
	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
			if (vm->vm_disks[i][j] != -1) {
				close(vm->vm_disks[i][j]);
				vm->vm_disks[i][j] = -1;
			}
		}
	}
	/* close the interface fds and forget their configuration */
	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
		if (vm->vm_ifs[i].vif_fd != -1) {
			close(vm->vm_ifs[i].vif_fd);
			vm->vm_ifs[i].vif_fd = -1;
		}
		free(vm->vm_ifs[i].vif_name);
		free(vm->vm_ifs[i].vif_switch);
		free(vm->vm_ifs[i].vif_group);
		vm->vm_ifs[i].vif_name = NULL;
		vm->vm_ifs[i].vif_switch = NULL;
		vm->vm_ifs[i].vif_group = NULL;
	}
	if (vm->vm_kernel != -1) {
		close(vm->vm_kernel);
		vm->vm_kernel = -1;
	}
	if (vm->vm_cdrom != -1) {
		close(vm->vm_cdrom);
		vm->vm_cdrom = -1;
	}
	if (!keeptty) {
		/* release the console tty and reset the console owner */
		vm_closetty(vm);
		vm->vm_uid = 0;
	}
}
1152
/*
 * vm_remove
 *
 * Remove a VM from the running configuration entirely: unlink it from
 * the VM list, stop it (closing all descriptors and the console tty)
 * and free the vm structure.
 *
 * Parameters:
 *  vm:     the VM to remove; NULL is a no-op
 *  caller: name of the calling function, used in the debug log only
 */
void
vm_remove(struct vmd_vm *vm, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s removing vm %d from running config",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid);

	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);

	/* NOTE(review): vm_stop() below also calls user_put(), so two
	 * references on vm->vm_user are dropped here — verify this
	 * matches the number of user_get() calls made for this VM. */
	user_put(vm->vm_user);
	vm_stop(vm, 0, caller);
	free(vm);
}
1171
1172int
1173vm_register(struct privsep *ps, struct vmop_create_params *vmc,
1174    struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
1175{
1176	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
1177	struct vm_create_params	*vcp = &vmc->vmc_params;
1178	struct vmop_owner	*vmo = NULL;
1179	struct vmd_user		*usr = NULL;
1180	uint32_t		 rng;
1181	unsigned int		 i, j;
1182	struct vmd_switch	*sw;
1183	char			*s;
1184
1185	/* Check if this is an instance of another VM */
1186	if (vm_instance(ps, &vm_parent, vmc, uid) == -1)
1187		return (-1);
1188
1189	errno = 0;
1190	*ret_vm = NULL;
1191
1192	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1193	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1194		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
1195		    uid) != 0) {
1196			errno = EPERM;
1197			goto fail;
1198		}
1199		*ret_vm = vm;
1200		errno = EALREADY;
1201		goto fail;
1202	}
1203
1204	if (vm_parent != NULL)
1205		vmo = &vm_parent->vm_params.vmc_insowner;
1206
1207	/* non-root users can only start existing VMs or instances */
1208	if (vm_checkperm(NULL, vmo, uid) != 0) {
1209		log_warnx("permission denied");
1210		errno = EPERM;
1211		goto fail;
1212	}
1213	if (vmc->vmc_flags == 0) {
1214		log_warnx("invalid configuration, no devices");
1215		errno = VMD_DISK_MISSING;
1216		goto fail;
1217	}
1218	if (vcp->vcp_ncpus == 0)
1219		vcp->vcp_ncpus = 1;
1220	if (vcp->vcp_memranges[0].vmr_size == 0)
1221		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
1222	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
1223		log_warnx("invalid number of CPUs");
1224		goto fail;
1225	} else if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) {
1226		log_warnx("invalid number of disks");
1227		goto fail;
1228	} else if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) {
1229		log_warnx("invalid number of interfaces");
1230		goto fail;
1231	} else if (strlen(vcp->vcp_kernel) == 0 &&
1232	    vcp->vcp_ndisks == 0 && strlen(vcp->vcp_cdrom) == 0) {
1233		log_warnx("no kernel or disk/cdrom specified");
1234		goto fail;
1235	} else if (strlen(vcp->vcp_name) == 0) {
1236		log_warnx("invalid VM name");
1237		goto fail;
1238	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
1239	    *vcp->vcp_name == '_') {
1240		log_warnx("invalid VM name");
1241		goto fail;
1242	} else {
1243		for (s = vcp->vcp_name; *s != '\0'; ++s) {
1244			if (!(isalnum(*s) || *s == '.' || *s == '-' ||
1245			    *s == '_')) {
1246				log_warnx("invalid VM name");
1247				goto fail;
1248			}
1249		}
1250	}
1251
1252	/* track active users */
1253	if (uid != 0 && env->vmd_users != NULL &&
1254	    (usr = user_get(uid)) == NULL) {
1255		log_warnx("could not add user");
1256		goto fail;
1257	}
1258
1259	if ((vm = calloc(1, sizeof(*vm))) == NULL)
1260		goto fail;
1261
1262	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
1263	vmc = &vm->vm_params;
1264	vcp = &vmc->vmc_params;
1265	vm->vm_pid = -1;
1266	vm->vm_tty = -1;
1267	vm->vm_receive_fd = -1;
1268	vm->vm_paused = 0;
1269	vm->vm_user = usr;
1270
1271	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
1272		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
1273			vm->vm_disks[i][j] = -1;
1274	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
1275		vm->vm_ifs[i].vif_fd = -1;
1276	for (i = 0; i < vcp->vcp_nnics; i++) {
1277		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
1278			/* inherit per-interface flags from the switch */
1279			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
1280		}
1281
1282		/*
1283		 * If the MAC address is zero, always randomize it in vmd(8)
1284		 * because we cannot rely on the guest OS to do the right
1285		 * thing like OpenBSD does.  Based on ether_fakeaddr()
1286		 * from the kernel, incremented by one to differentiate
1287		 * the source.
1288		 */
1289		if (memcmp(zero_mac, &vcp->vcp_macs[i], ETHER_ADDR_LEN) == 0) {
1290			rng = arc4random();
1291			vcp->vcp_macs[i][0] = 0xfe;
1292			vcp->vcp_macs[i][1] = 0xe1;
1293			vcp->vcp_macs[i][2] = 0xba + 1;
1294			vcp->vcp_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
1295			vcp->vcp_macs[i][4] = rng;
1296			vcp->vcp_macs[i][5] = rng >> 8;
1297		}
1298	}
1299	vm->vm_kernel = -1;
1300	vm->vm_cdrom = -1;
1301	vm->vm_iev.ibuf.fd = -1;
1302
1303	if (++env->vmd_nvm == 0)
1304		fatalx("too many vms");
1305
1306	/* Assign a new internal Id if not specified */
1307	vm->vm_vmid = id == 0 ? env->vmd_nvm : id;
1308
1309	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
1310	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);
1311
1312	*ret_vm = vm;
1313	return (0);
1314 fail:
1315	if (errno == 0)
1316		errno = EINVAL;
1317	return (-1);
1318}
1319
1320int
1321vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1322    struct vmop_create_params *vmc, uid_t uid)
1323{
1324	char			*name;
1325	struct vm_create_params	*vcp = &vmc->vmc_params;
1326	struct vmop_create_params *vmcp;
1327	struct vm_create_params	*vcpp;
1328	struct vmd_vm		*vm = NULL;
1329	unsigned int		 i, j;
1330	uint32_t		 id;
1331
1332	/* return without error if the parent is NULL (nothing to inherit) */
1333	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1334	    (*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL)
1335		return (0);
1336
1337	errno = 0;
1338	vmcp = &(*vm_parent)->vm_params;
1339	vcpp = &vmcp->vmc_params;
1340
1341	/* Are we allowed to create an instance from this VM? */
1342	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1343		log_warnx("vm \"%s\" no permission to create vm instance",
1344		    vcpp->vcp_name);
1345		errno = ENAMETOOLONG;
1346		return (-1);
1347	}
1348
1349	id = vcp->vcp_id;
1350	name = vcp->vcp_name;
1351
1352	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1353	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1354		errno = EPROCLIM;
1355		return (-1);
1356	}
1357
1358	/* CPU */
1359	if (vcp->vcp_ncpus == 0)
1360		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1361	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1362	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1363		log_warnx("vm \"%s\" no permission to set cpus", name);
1364		errno = EPERM;
1365		return (-1);
1366	}
1367
1368	/* memory */
1369	if (vcp->vcp_memranges[0].vmr_size == 0)
1370		vcp->vcp_memranges[0].vmr_size =
1371		    vcpp->vcp_memranges[0].vmr_size;
1372	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1373	    vcp->vcp_memranges[0].vmr_size !=
1374	    vcpp->vcp_memranges[0].vmr_size) {
1375		log_warnx("vm \"%s\" no permission to set memory", name);
1376		errno = EPERM;
1377		return (-1);
1378	}
1379
1380	/* disks cannot be inherited */
1381	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1382	    vcp->vcp_ndisks) {
1383		log_warnx("vm \"%s\" no permission to set disks", name);
1384		errno = EPERM;
1385		return (-1);
1386	}
1387	for (i = 0; i < vcp->vcp_ndisks; i++) {
1388		/* Check if this disk is already used in the parent */
1389		for (j = 0; j < vcpp->vcp_ndisks; j++) {
1390			if (strcmp(vcp->vcp_disks[i],
1391			    vcpp->vcp_disks[j]) == 0) {
1392				log_warnx("vm \"%s\" disk %s cannot be reused",
1393				    name, vcp->vcp_disks[i]);
1394				errno = EBUSY;
1395				return (-1);
1396			}
1397		}
1398		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1399	}
1400
1401	/* interfaces */
1402	if (vcp->vcp_nnics > 0 &&
1403	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1404	    vcp->vcp_nnics != vcpp->vcp_nnics) {
1405		log_warnx("vm \"%s\" no permission to set interfaces", name);
1406		errno = EPERM;
1407		return (-1);
1408	}
1409	for (i = 0; i < vcpp->vcp_nnics; i++) {
1410		/* Interface got overwritten */
1411		if (i < vcp->vcp_nnics)
1412			continue;
1413
1414		/* Copy interface from parent */
1415		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1416		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1417		    sizeof(vmc->vmc_ifnames[i]));
1418		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1419		    sizeof(vmc->vmc_ifswitch[i]));
1420		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1421		    sizeof(vmc->vmc_ifgroup[i]));
1422		memcpy(vcp->vcp_macs[i], vcpp->vcp_macs[i],
1423		    sizeof(vcp->vcp_macs[i]));
1424		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1425		vcp->vcp_nnics++;
1426	}
1427	for (i = 0; i < vcp->vcp_nnics; i++) {
1428		for (j = 0; j < vcpp->vcp_nnics; j++) {
1429			if (memcmp(zero_mac, vcp->vcp_macs[i],
1430			    sizeof(vcp->vcp_macs[i])) != 0 &&
1431			    memcmp(vcpp->vcp_macs[i], vcp->vcp_macs[i],
1432			    sizeof(vcp->vcp_macs[i])) != 0) {
1433				log_warnx("vm \"%s\" lladdr cannot be reused",
1434				    name);
1435				errno = EBUSY;
1436				return (-1);
1437			}
1438			if (strlen(vmc->vmc_ifnames[i]) &&
1439			    strcmp(vmc->vmc_ifnames[i],
1440			    vmcp->vmc_ifnames[j]) == 0) {
1441				log_warnx("vm \"%s\" %s cannot be reused",
1442				    vmc->vmc_ifnames[i], name);
1443				errno = EBUSY;
1444				return (-1);
1445			}
1446		}
1447	}
1448
1449	/* kernel */
1450	if (strlen(vcp->vcp_kernel) > 0) {
1451		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1452			log_warnx("vm \"%s\" no permission to set boot image",
1453			    name);
1454			errno = EPERM;
1455			return (-1);
1456		}
1457		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1458	} else if (strlcpy(vcp->vcp_kernel, vcpp->vcp_kernel,
1459	    sizeof(vcp->vcp_kernel)) >= sizeof(vcp->vcp_kernel)) {
1460		log_warnx("vm \"%s\" kernel name too long", name);
1461		errno = EINVAL;
1462		return (-1);
1463	}
1464
1465	/* cdrom */
1466	if (strlen(vcp->vcp_cdrom) > 0) {
1467		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1468			log_warnx("vm \"%s\" no permission to set cdrom", name);
1469			errno = EPERM;
1470			return (-1);
1471		}
1472		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1473	} else if (strlcpy(vcp->vcp_cdrom, vcpp->vcp_cdrom,
1474	    sizeof(vcp->vcp_cdrom)) >= sizeof(vcp->vcp_cdrom)) {
1475		log_warnx("vm \"%s\" cdrom name too long", name);
1476		errno = EINVAL;
1477		return (-1);
1478	}
1479
1480	/* user */
1481	if (vmc->vmc_owner.uid == 0)
1482		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1483	else if (vmc->vmc_owner.uid != uid &&
1484	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1485		log_warnx("vm \"%s\" user mismatch", name);
1486		errno = EPERM;
1487		return (-1);
1488	}
1489
1490	/* group */
1491	if (vmc->vmc_owner.gid == 0)
1492		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1493	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1494		log_warnx("vm \"%s\" group mismatch", name);
1495		errno = EPERM;
1496		return (-1);
1497	}
1498
1499	/* child instances */
1500	if (vmc->vmc_insflags) {
1501		log_warnx("vm \"%s\" cannot change instance permissions", name);
1502		errno = EPERM;
1503		return (-1);
1504	}
1505	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1506		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
1507		vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid;
1508		vmc->vmc_insflags = vmcp->vmc_insflags;
1509	} else {
1510		vmc->vmc_insowner.gid = 0;
1511		vmc->vmc_insowner.uid = 0;
1512		vmc->vmc_insflags = 0;
1513	}
1514
1515	/* finished, remove instance flags */
1516	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1517
1518	return (0);
1519}
1520
1521/*
1522 * vm_checkperm
1523 *
1524 * Checks if the user represented by the 'uid' parameter is allowed to
1525 * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1526 * console.)
1527 *
1528 * Parameters:
1529 *  vm: the VM whose permission is to be checked
1530 *  vmo: the required uid/gid to be checked
1531 *  uid: the user ID of the user making the request
1532 *
1533 * Return values:
1534 *   0: the permission should be granted
 *  -1: the permission check failed (also returned if vmo == NULL)
1536 */
1537int
1538vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1539{
1540	struct group	*gr;
1541	struct passwd	*pw;
1542	char		**grmem;
1543
1544	/* root has no restrictions */
1545	if (uid == 0)
1546		return (0);
1547
1548	if (vmo == NULL)
1549		return (-1);
1550
1551	/* check user */
1552	if (vm == NULL) {
1553		if  (vmo->uid == uid)
1554			return (0);
1555	} else {
1556		/*
1557		 * check user of running vm (the owner of a running vm can
1558		 * be different to (or more specific than) the configured owner.
1559		 */
1560		if ((vm->vm_running && vm->vm_uid == uid) ||
1561		    (!vm->vm_running && vmo->uid == uid))
1562			return (0);
1563	}
1564
1565	/* check groups */
1566	if (vmo->gid != -1) {
1567		if ((pw = getpwuid(uid)) == NULL)
1568			return (-1);
1569		if (pw->pw_gid == vmo->gid)
1570			return (0);
1571		if ((gr = getgrgid(vmo->gid)) != NULL) {
1572			for (grmem = gr->gr_mem; *grmem; grmem++)
1573				if (strcmp(*grmem, pw->pw_name) == 0)
1574					return (0);
1575		}
1576	}
1577
1578	return (-1);
1579}
1580
1581/*
1582 * vm_checkinsflag
1583 *
 * Checks whether the non-root user is allowed to set an instance option.
1585 *
1586 * Parameters:
1587 *  vmc: the VM create parameters
1588 *  flag: the flag to be checked
1589 *  uid: the user ID of the user making the request
1590 *
1591 * Return values:
1592 *   0: the permission should be granted
 *  -1: the permission check failed
1594 */
1595int
1596vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1597{
1598	/* root has no restrictions */
1599	if (uid == 0)
1600		return (0);
1601
1602	if ((vmc->vmc_insflags & flag) == 0)
1603		return (-1);
1604
1605	return (0);
1606}
1607
1608/*
1609 * vm_checkaccess
1610 *
1611 * Checks if the user represented by the 'uid' parameter is allowed to
1612 * access the file described by the 'path' parameter.
1613 *
1614 * Parameters:
1615 *  fd: the file descriptor of the opened file
1616 *  uflag: check if the userid has access to the file
1617 *  uid: the user ID of the user making the request
1618 *  amode: the access flags of R_OK and W_OK
1619 *
1620 * Return values:
1621 *   0: the permission should be granted
1622 *  -1: the permission check failed
1623 */
int
vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
{
	struct stat	 st;
	struct passwd	*pw;
	struct group	*gr;
	char		**member;
	mode_t		 other, owner, group;

	if (fd == -1)
		return (-1);

	/* The file has to be accessible and a regular file. */
	if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode))
		return (-1);

	/* root has no restrictions; uflag == 0 disables the check */
	if (uid == 0 || uflag == 0)
		return (0);

	/* build the required permission bits for each class */
	other = owner = group = 0;
	if (amode & W_OK) {
		other |= S_IWOTH;
		owner |= S_IWUSR;
		group |= S_IWGRP;
	}
	if (amode & R_OK) {
		other |= S_IROTH;
		owner |= S_IRUSR;
		group |= S_IRGRP;
	}

	/* world-accessible? */
	if ((st.st_mode & other) == other)
		return (0);

	/* accessible by the owning user? */
	if (uid == st.st_uid && (st.st_mode & owner) == owner)
		return (0);

	/* otherwise the group bits must allow the access ... */
	if ((st.st_mode & group) != group)
		return (-1);
	/* ... and the user must belong to the file's group */
	if ((pw = getpwuid(uid)) == NULL)
		return (-1);
	if (pw->pw_gid == st.st_gid)
		return (0);
	if ((gr = getgrgid(st.st_gid)) != NULL) {
		for (member = gr->gr_mem; *member != NULL; member++)
			if (strcmp(*member, pw->pw_name) == 0)
				return (0);
	}

	return (-1);
}
1675
/*
 * vm_opentty
 *
 * Allocate the console tty for a VM from the pre-opened /dev/ptm fd
 * and adjust ownership and permissions of the tty device so the VM
 * owner can use it.
 *
 * Parameters:
 *  vm: the VM; on success vm->vm_tty holds the controlling fd and
 *      vm->vm_ttyname the slave device name
 *
 * Return values:
 *   0: success
 *  -1: failure; any partially opened tty state is closed again
 */
int
vm_opentty(struct vmd_vm *vm)
{
	struct ptmget		 ptm;
	struct stat		 st;
	struct group		*gr;
	uid_t			 uid;
	gid_t			 gid;
	mode_t			 mode;
	int			 on;

	/*
	 * Open tty with pre-opened PTM fd
	 */
	if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1))
		return (-1);

	/*
	 * We use user ioctl(2) mode to pass break commands.
	 */
	on = 1;
	if (ioctl(ptm.cfd, TIOCUCNTL, &on))
		fatal("could not enable user ioctl mode");

	/* keep the controlling side; the slave is reopened by name */
	vm->vm_tty = ptm.cfd;
	close(ptm.sfd);
	if ((vm->vm_ttyname = strdup(ptm.sn)) == NULL)
		goto fail;

	uid = vm->vm_uid;
	gid = vm->vm_params.vmc_owner.gid;

	/* pick mode and group: owner group > "tty" group > root-only */
	if (vm->vm_params.vmc_owner.gid != -1) {
		mode = 0660;
	} else if ((gr = getgrnam("tty")) != NULL) {
		gid = gr->gr_gid;
		mode = 0620;
	} else {
		mode = 0600;
		gid = 0;
	}

	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
	    __func__, vm->vm_params.vmc_params.vcp_name,
	    vm->vm_ttyname, uid, gid, mode);

	/*
	 * Change ownership and mode of the tty as required.
	 * Loosely based on the implementation of sshpty.c
	 */
	if (stat(vm->vm_ttyname, &st) == -1)
		goto fail;

	if (st.st_uid != uid || st.st_gid != gid) {
		if (chown(vm->vm_ttyname, uid, gid) == -1) {
			log_warn("chown %s %d %d failed, uid %d",
			    vm->vm_ttyname, uid, gid, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
		if (chmod(vm->vm_ttyname, mode) == -1) {
			log_warn("chmod %s %o failed, uid %d",
			    vm->vm_ttyname, mode, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	return (0);
 fail:
	vm_closetty(vm);
	return (-1);
}
1758
1759void
1760vm_closetty(struct vmd_vm *vm)
1761{
1762	if (vm->vm_tty != -1) {
1763		/* Release and close the tty */
1764		if (fchown(vm->vm_tty, 0, 0) == -1)
1765			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1766		if (fchmod(vm->vm_tty, 0666) == -1)
1767			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1768		close(vm->vm_tty);
1769		vm->vm_tty = -1;
1770	}
1771	free(vm->vm_ttyname);
1772	vm->vm_ttyname = NULL;
1773}
1774
1775void
1776switch_remove(struct vmd_switch *vsw)
1777{
1778	if (vsw == NULL)
1779		return;
1780
1781	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1782
1783	free(vsw->sw_group);
1784	free(vsw->sw_name);
1785	free(vsw);
1786}
1787
1788struct vmd_switch *
1789switch_getbyname(const char *name)
1790{
1791	struct vmd_switch	*vsw;
1792
1793	if (name == NULL)
1794		return (NULL);
1795	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1796		if (strcmp(vsw->sw_name, name) == 0)
1797			return (vsw);
1798	}
1799
1800	return (NULL);
1801}
1802
1803struct vmd_user *
1804user_get(uid_t uid)
1805{
1806	struct vmd_user		*usr;
1807
1808	if (uid == 0)
1809		return (NULL);
1810
1811	/* first try to find an existing user */
1812	TAILQ_FOREACH(usr, env->vmd_users, usr_entry) {
1813		if (usr->usr_id.uid == uid)
1814			goto done;
1815	}
1816
1817	if ((usr = calloc(1, sizeof(*usr))) == NULL) {
1818		log_warn("could not allocate user");
1819		return (NULL);
1820	}
1821
1822	usr->usr_id.uid = uid;
1823	usr->usr_id.gid = -1;
1824	TAILQ_INSERT_TAIL(env->vmd_users, usr, usr_entry);
1825
1826 done:
1827	DPRINTF("%s: uid %d #%d +",
1828	    __func__, usr->usr_id.uid, usr->usr_refcnt + 1);
1829	usr->usr_refcnt++;
1830
1831	return (usr);
1832}
1833
1834void
1835user_put(struct vmd_user *usr)
1836{
1837	if (usr == NULL)
1838		return;
1839
1840	DPRINTF("%s: uid %d #%d -",
1841	    __func__, usr->usr_id.uid, usr->usr_refcnt - 1);
1842
1843	if (--usr->usr_refcnt > 0)
1844		return;
1845
1846	TAILQ_REMOVE(env->vmd_users, usr, usr_entry);
1847	free(usr);
1848}
1849
1850void
1851user_inc(struct vm_create_params *vcp, struct vmd_user *usr, int inc)
1852{
1853	char	 mem[FMT_SCALED_STRSIZE];
1854
1855	if (usr == NULL)
1856		return;
1857
1858	/* increment or decrement counters */
1859	inc = inc ? 1 : -1;
1860
1861	usr->usr_maxcpu += vcp->vcp_ncpus * inc;
1862	usr->usr_maxmem += vcp->vcp_memranges[0].vmr_size * inc;
1863	usr->usr_maxifs += vcp->vcp_nnics * inc;
1864
1865	if (log_getverbose() > 1) {
1866		(void)fmt_scaled(usr->usr_maxmem * 1024 * 1024, mem);
1867		log_debug("%s: %c uid %d ref %d cpu %llu mem %s ifs %llu",
1868		    __func__, inc == 1 ? '+' : '-',
1869		    usr->usr_id.uid, usr->usr_refcnt,
1870		    usr->usr_maxcpu, mem, usr->usr_maxifs);
1871	}
1872}
1873
1874int
1875user_checklimit(struct vmd_user *usr, struct vm_create_params *vcp)
1876{
1877	const char	*limit = "";
1878
1879	/* XXX make the limits configurable */
1880	if (usr->usr_maxcpu > VM_DEFAULT_USER_MAXCPU) {
1881		limit = "cpu ";
1882		goto fail;
1883	}
1884	if (usr->usr_maxmem > VM_DEFAULT_USER_MAXMEM) {
1885		limit = "memory ";
1886		goto fail;
1887	}
1888	if (usr->usr_maxifs > VM_DEFAULT_USER_MAXIFS) {
1889		limit = "interface ";
1890		goto fail;
1891	}
1892
1893	return (0);
1894
1895 fail:
1896	log_warnx("%s: user %d %slimit reached", vcp->vcp_name,
1897	    usr->usr_id.uid, limit);
1898	return (-1);
1899}
1900
/*
 * get_string
 *
 * Extract the leading printable characters of a fixed-size buffer as
 * a newly allocated NUL-terminated string.  Scanning stops at the
 * first non-printable byte (including NUL) or after len bytes.
 *
 * Parameters:
 *  ptr: input buffer, not necessarily NUL-terminated
 *  len: maximum number of bytes to examine
 *
 * Returns the allocated string (caller must free it), or NULL on
 * allocation failure.
 */
char *
get_string(uint8_t *ptr, size_t len)
{
	size_t	 i;
	char	*s;

	for (i = 0; i < len; i++)
		if (!isprint(ptr[i]))
			break;

	/*
	 * Copy explicitly instead of strndup(ptr, i): ptr is uint8_t *,
	 * and passing it where const char * is expected relies on an
	 * incompatible pointer conversion.
	 */
	if ((s = malloc(i + 1)) == NULL)
		return (NULL);
	memcpy(s, ptr, i);
	s[i] = '\0';

	return (s);
}
1912
uint32_t
prefixlen2mask(uint8_t prefixlen)
{
	uint32_t	 hostbits;

	/*
	 * Convert an IPv4 CIDR prefix length (clamped to 0..32) into a
	 * netmask in network byte order.
	 */
	if (prefixlen > 32)
		prefixlen = 32;
	if (prefixlen == 0)
		return (0);

	hostbits = 32 - prefixlen;
	return (htonl(0xffffffff << hostbits));
}
1924
void
prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
{
	struct in6_addr	 m;
	int		 bits, i;

	/*
	 * Convert an IPv6 prefix length (clamped to 0..128) into a
	 * netmask, written byte by byte into *mask.
	 */
	bits = prefixlen > 128 ? 128 : prefixlen;

	memset(&m, 0, sizeof(m));
	for (i = 0; bits >= 8; i++, bits -= 8)
		m.s6_addr[i] = 0xff;
	if (bits > 0)
		m.s6_addr[i] = (uint8_t)(0xff << (8 - bits));

	memcpy(mask, &m, sizeof(m));
}
1943
1944void
1945getmonotime(struct timeval *tv)
1946{
1947	struct timespec	 ts;
1948
1949	if (clock_gettime(CLOCK_MONOTONIC, &ts))
1950		fatal("clock_gettime");
1951
1952	TIMESPEC_TO_TIMEVAL(tv, &ts);
1953}
1954