/* vmd.c revision 1.105 */
1/* $OpenBSD: vmd.c,v 1.105 2018/11/21 12:31:47 reyk Exp $ */ 2 3/* 4 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19#include <sys/param.h> /* nitems */ 20#include <sys/queue.h> 21#include <sys/wait.h> 22#include <sys/cdefs.h> 23#include <sys/stat.h> 24#include <sys/tty.h> 25#include <sys/ttycom.h> 26#include <sys/ioctl.h> 27 28#include <stdio.h> 29#include <stdlib.h> 30#include <string.h> 31#include <termios.h> 32#include <errno.h> 33#include <event.h> 34#include <fcntl.h> 35#include <pwd.h> 36#include <signal.h> 37#include <syslog.h> 38#include <unistd.h> 39#include <util.h> 40#include <ctype.h> 41#include <pwd.h> 42#include <grp.h> 43 44#include <machine/specialreg.h> 45#include <machine/vmmvar.h> 46 47#include "proc.h" 48#include "atomicio.h" 49#include "vmd.h" 50 51__dead void usage(void); 52 53int main(int, char **); 54int vmd_configure(void); 55void vmd_sighdlr(int sig, short event, void *arg); 56void vmd_shutdown(void); 57int vmd_control_run(void); 58int vmd_dispatch_control(int, struct privsep_proc *, struct imsg *); 59int vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *); 60int vmd_check_vmh(struct vm_dump_header *); 61 62int vm_instance(struct privsep *, struct vmd_vm **, 63 struct 
vmop_create_params *, uid_t); 64int vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t); 65 66struct vmd *env; 67 68static struct privsep_proc procs[] = { 69 /* Keep "priv" on top as procs[0] */ 70 { "priv", PROC_PRIV, NULL, priv }, 71 { "control", PROC_CONTROL, vmd_dispatch_control, control }, 72 { "vmm", PROC_VMM, vmd_dispatch_vmm, vmm, vmm_shutdown }, 73}; 74 75/* For the privileged process */ 76static struct privsep_proc *proc_priv = &procs[0]; 77static struct passwd proc_privpw; 78static const uint8_t zero_mac[ETHER_ADDR_LEN]; 79 80int 81vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg) 82{ 83 struct privsep *ps = p->p_ps; 84 int res = 0, ret = 0, cmd = 0, verbose; 85 unsigned int v = 0, flags; 86 struct vmop_create_params vmc; 87 struct vmop_id vid; 88 struct vmop_result vmr; 89 struct vm_dump_header vmh; 90 struct vmd_vm *vm = NULL; 91 char *str = NULL; 92 uint32_t id = 0; 93 struct control_sock *rcs; 94 95 switch (imsg->hdr.type) { 96 case IMSG_VMDOP_START_VM_REQUEST: 97 IMSG_SIZE_CHECK(imsg, &vmc); 98 memcpy(&vmc, imsg->data, sizeof(vmc)); 99 ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid); 100 if (vmc.vmc_flags == 0) { 101 /* start an existing VM with pre-configured options */ 102 if (!(ret == -1 && errno == EALREADY && 103 vm->vm_running == 0)) { 104 res = errno; 105 cmd = IMSG_VMDOP_START_VM_RESPONSE; 106 } 107 } else if (ret != 0) { 108 res = errno; 109 cmd = IMSG_VMDOP_START_VM_RESPONSE; 110 } 111 if (res == 0 && 112 config_setvm(ps, vm, 113 imsg->hdr.peerid, vm->vm_params.vmc_owner.uid) == -1) { 114 res = errno; 115 cmd = IMSG_VMDOP_START_VM_RESPONSE; 116 } 117 break; 118 case IMSG_VMDOP_TERMINATE_VM_REQUEST: 119 IMSG_SIZE_CHECK(imsg, &vid); 120 memcpy(&vid, imsg->data, sizeof(vid)); 121 flags = vid.vid_flags; 122 123 if ((id = vid.vid_id) == 0) { 124 /* Lookup vm (id) by name */ 125 if ((vm = vm_getbyname(vid.vid_name)) == NULL) { 126 res = ENOENT; 127 cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; 128 break; 129 
} else if (vm->vm_shutdown && 130 (flags & VMOP_FORCE) == 0) { 131 res = EALREADY; 132 cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; 133 break; 134 } else if (vm->vm_running == 0) { 135 res = EINVAL; 136 cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; 137 break; 138 } 139 id = vm->vm_vmid; 140 } else if ((vm = vm_getbyvmid(id)) == NULL) { 141 res = ENOENT; 142 cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; 143 break; 144 } 145 if (vm_checkperm(vm, &vm->vm_params.vmc_owner, 146 vid.vid_uid) != 0) { 147 res = EPERM; 148 cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; 149 break; 150 } 151 152 memset(&vid, 0, sizeof(vid)); 153 vid.vid_id = id; 154 vid.vid_flags = flags; 155 if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type, 156 imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1) 157 return (-1); 158 break; 159 case IMSG_VMDOP_GET_INFO_VM_REQUEST: 160 proc_forward_imsg(ps, imsg, PROC_VMM, -1); 161 break; 162 case IMSG_VMDOP_LOAD: 163 IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */ 164 str = get_string((uint8_t *)imsg->data, 165 IMSG_DATA_SIZE(imsg)); 166 case IMSG_VMDOP_RELOAD: 167 if (vmd_reload(0, str) == -1) 168 cmd = IMSG_CTL_FAIL; 169 else 170 cmd = IMSG_CTL_OK; 171 free(str); 172 break; 173 case IMSG_CTL_RESET: 174 IMSG_SIZE_CHECK(imsg, &v); 175 memcpy(&v, imsg->data, sizeof(v)); 176 if (vmd_reload(v, NULL) == -1) 177 cmd = IMSG_CTL_FAIL; 178 else 179 cmd = IMSG_CTL_OK; 180 break; 181 case IMSG_CTL_VERBOSE: 182 IMSG_SIZE_CHECK(imsg, &verbose); 183 memcpy(&verbose, imsg->data, sizeof(verbose)); 184 log_setverbose(verbose); 185 186 proc_forward_imsg(ps, imsg, PROC_VMM, -1); 187 proc_forward_imsg(ps, imsg, PROC_PRIV, -1); 188 cmd = IMSG_CTL_OK; 189 break; 190 case IMSG_VMDOP_PAUSE_VM: 191 case IMSG_VMDOP_UNPAUSE_VM: 192 IMSG_SIZE_CHECK(imsg, &vid); 193 memcpy(&vid, imsg->data, sizeof(vid)); 194 if (vid.vid_id == 0) { 195 if ((vm = vm_getbyname(vid.vid_name)) == NULL) { 196 res = ENOENT; 197 cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE; 198 break; 199 } else { 200 vid.vid_id = vm->vm_vmid; 
201 } 202 } else if ((vm = vm_getbyid(vid.vid_id)) == NULL) { 203 res = ENOENT; 204 cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE; 205 break; 206 } 207 if (vm_checkperm(vm, &vm->vm_params.vmc_owner, 208 vid.vid_uid) != 0) { 209 res = EPERM; 210 cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE; 211 break; 212 } 213 proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type, 214 imsg->hdr.peerid, -1, &vid, sizeof(vid)); 215 break; 216 case IMSG_VMDOP_SEND_VM_REQUEST: 217 IMSG_SIZE_CHECK(imsg, &vid); 218 memcpy(&vid, imsg->data, sizeof(vid)); 219 id = vid.vid_id; 220 if (vid.vid_id == 0) { 221 if ((vm = vm_getbyname(vid.vid_name)) == NULL) { 222 res = ENOENT; 223 cmd = IMSG_VMDOP_SEND_VM_RESPONSE; 224 close(imsg->fd); 225 break; 226 } else { 227 vid.vid_id = vm->vm_vmid; 228 } 229 } else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) { 230 res = ENOENT; 231 cmd = IMSG_VMDOP_SEND_VM_RESPONSE; 232 close(imsg->fd); 233 break; 234 } else { 235 } 236 vmr.vmr_id = vid.vid_id; 237 log_debug("%s: sending fd to vmm", __func__); 238 proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type, 239 imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid)); 240 break; 241 case IMSG_VMDOP_RECEIVE_VM_REQUEST: 242 IMSG_SIZE_CHECK(imsg, &vid); 243 memcpy(&vid, imsg->data, sizeof(vid)); 244 if (imsg->fd == -1) { 245 log_warnx("%s: invalid fd", __func__); 246 return (-1); 247 } 248 if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) != 249 sizeof(vmh)) { 250 log_warnx("%s: error reading vmh from received vm", 251 __func__); 252 res = EIO; 253 close(imsg->fd); 254 cmd = IMSG_VMDOP_START_VM_RESPONSE; 255 break; 256 } 257 258 if (vmd_check_vmh(&vmh)) { 259 res = ENOENT; 260 close(imsg->fd); 261 cmd = IMSG_VMDOP_START_VM_RESPONSE; 262 break; 263 } 264 if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) != 265 sizeof(vmc)) { 266 log_warnx("%s: error reading vmc from received vm", 267 __func__); 268 res = EIO; 269 close(imsg->fd); 270 cmd = IMSG_VMDOP_START_VM_RESPONSE; 271 break; 272 } 273 strlcpy(vmc.vmc_params.vcp_name, vid.vid_name, 274 
sizeof(vmc.vmc_params.vcp_name)); 275 vmc.vmc_params.vcp_id = 0; 276 277 ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid); 278 if (ret != 0) { 279 res = errno; 280 cmd = IMSG_VMDOP_START_VM_RESPONSE; 281 close(imsg->fd); 282 } else { 283 vm->vm_received = 1; 284 config_setvm(ps, vm, imsg->hdr.peerid, 285 vmc.vmc_owner.uid); 286 log_debug("%s: sending fd to vmm", __func__); 287 proc_compose_imsg(ps, PROC_VMM, -1, 288 IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd, 289 NULL, 0); 290 } 291 break; 292 case IMSG_VMDOP_DONE: 293 control_reset(&ps->ps_csock); 294 TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry) 295 control_reset(rcs); 296 cmd = 0; 297 break; 298 default: 299 return (-1); 300 } 301 302 switch (cmd) { 303 case 0: 304 break; 305 case IMSG_VMDOP_START_VM_RESPONSE: 306 case IMSG_VMDOP_TERMINATE_VM_RESPONSE: 307 memset(&vmr, 0, sizeof(vmr)); 308 vmr.vmr_result = res; 309 vmr.vmr_id = id; 310 if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd, 311 imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1) 312 return (-1); 313 break; 314 default: 315 if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd, 316 imsg->hdr.peerid, -1, &res, sizeof(res)) == -1) 317 return (-1); 318 break; 319 } 320 321 return (0); 322} 323 324int 325vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg) 326{ 327 struct vmop_result vmr; 328 struct privsep *ps = p->p_ps; 329 int res = 0; 330 struct vmd_vm *vm; 331 struct vm_create_params *vcp; 332 struct vmop_info_result vir; 333 334 switch (imsg->hdr.type) { 335 case IMSG_VMDOP_PAUSE_VM_RESPONSE: 336 IMSG_SIZE_CHECK(imsg, &vmr); 337 memcpy(&vmr, imsg->data, sizeof(vmr)); 338 if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) 339 break; 340 proc_compose_imsg(ps, PROC_CONTROL, -1, 341 imsg->hdr.type, imsg->hdr.peerid, -1, 342 imsg->data, sizeof(imsg->data)); 343 log_info("%s: paused vm %d successfully", 344 vm->vm_params.vmc_params.vcp_name, 345 vm->vm_vmid); 346 break; 347 case IMSG_VMDOP_UNPAUSE_VM_RESPONSE: 348 IMSG_SIZE_CHECK(imsg, &vmr); 349 
memcpy(&vmr, imsg->data, sizeof(vmr)); 350 if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) 351 break; 352 proc_compose_imsg(ps, PROC_CONTROL, -1, 353 imsg->hdr.type, imsg->hdr.peerid, -1, 354 imsg->data, sizeof(imsg->data)); 355 log_info("%s: unpaused vm %d successfully.", 356 vm->vm_params.vmc_params.vcp_name, 357 vm->vm_vmid); 358 break; 359 case IMSG_VMDOP_START_VM_RESPONSE: 360 IMSG_SIZE_CHECK(imsg, &vmr); 361 memcpy(&vmr, imsg->data, sizeof(vmr)); 362 if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) 363 break; 364 vm->vm_pid = vmr.vmr_pid; 365 vcp = &vm->vm_params.vmc_params; 366 vcp->vcp_id = vmr.vmr_id; 367 368 /* 369 * If the peerid is not -1, forward the response back to the 370 * the control socket. If it is -1, the request originated 371 * from the parent, not the control socket. 372 */ 373 if (vm->vm_peerid != (uint32_t)-1) { 374 (void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname, 375 sizeof(vmr.vmr_ttyname)); 376 if (proc_compose_imsg(ps, PROC_CONTROL, -1, 377 imsg->hdr.type, vm->vm_peerid, -1, 378 &vmr, sizeof(vmr)) == -1) { 379 errno = vmr.vmr_result; 380 log_warn("%s: failed to foward vm result", 381 vcp->vcp_name); 382 vm_remove(vm, __func__); 383 return (-1); 384 } 385 } 386 387 if (vmr.vmr_result) { 388 errno = vmr.vmr_result; 389 log_warn("%s: failed to start vm", vcp->vcp_name); 390 vm_remove(vm, __func__); 391 break; 392 } 393 394 /* Now configure all the interfaces */ 395 if (vm_priv_ifconfig(ps, vm) == -1) { 396 log_warn("%s: failed to configure vm", vcp->vcp_name); 397 vm_remove(vm, __func__); 398 break; 399 } 400 401 log_info("%s: started vm %d successfully, tty %s", 402 vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname); 403 break; 404 case IMSG_VMDOP_TERMINATE_VM_RESPONSE: 405 IMSG_SIZE_CHECK(imsg, &vmr); 406 memcpy(&vmr, imsg->data, sizeof(vmr)); 407 DPRINTF("%s: forwarding TERMINATE VM for vm id %d", 408 __func__, vmr.vmr_id); 409 proc_forward_imsg(ps, imsg, PROC_CONTROL, -1); 410 if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) 411 break; 412 if 
(vmr.vmr_result == 0) { 413 /* Mark VM as shutting down */ 414 vm->vm_shutdown = 1; 415 } 416 break; 417 case IMSG_VMDOP_SEND_VM_RESPONSE: 418 IMSG_SIZE_CHECK(imsg, &vmr); 419 memcpy(&vmr, imsg->data, sizeof(vmr)); 420 if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) 421 break; 422 if (!vmr.vmr_result) { 423 log_info("%s: sent vm %d successfully.", 424 vm->vm_params.vmc_params.vcp_name, 425 vm->vm_vmid); 426 if (vm->vm_from_config) 427 vm_stop(vm, 0, __func__); 428 else 429 vm_remove(vm, __func__); 430 } 431 432 /* Send a response if a control client is waiting for it */ 433 if (imsg->hdr.peerid != (uint32_t)-1) { 434 /* the error is meaningless for deferred responses */ 435 vmr.vmr_result = 0; 436 437 if (proc_compose_imsg(ps, PROC_CONTROL, -1, 438 IMSG_VMDOP_SEND_VM_RESPONSE, 439 imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1) 440 return (-1); 441 } 442 break; 443 case IMSG_VMDOP_TERMINATE_VM_EVENT: 444 IMSG_SIZE_CHECK(imsg, &vmr); 445 memcpy(&vmr, imsg->data, sizeof(vmr)); 446 DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d", 447 __func__, vmr.vmr_id, vmr.vmr_result); 448 if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) { 449 log_debug("%s: vm %d is no longer available", 450 __func__, vmr.vmr_id); 451 break; 452 } 453 if (vmr.vmr_result != EAGAIN) { 454 if (vm->vm_from_config) 455 vm_stop(vm, 0, __func__); 456 else 457 vm_remove(vm, __func__); 458 } else { 459 /* Stop VM instance but keep the tty open */ 460 vm_stop(vm, 1, __func__); 461 config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid); 462 } 463 464 /* Send a response if a control client is waiting for it */ 465 if (imsg->hdr.peerid != (uint32_t)-1) { 466 /* the error is meaningless for deferred responses */ 467 vmr.vmr_result = 0; 468 469 if (proc_compose_imsg(ps, PROC_CONTROL, -1, 470 IMSG_VMDOP_TERMINATE_VM_RESPONSE, 471 imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1) 472 return (-1); 473 } 474 break; 475 case IMSG_VMDOP_GET_INFO_VM_DATA: 476 IMSG_SIZE_CHECK(imsg, &vir); 477 memcpy(&vir, imsg->data, 
sizeof(vir)); 478 if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) { 479 memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname)); 480 if (vm->vm_ttyname != NULL) 481 strlcpy(vir.vir_ttyname, vm->vm_ttyname, 482 sizeof(vir.vir_ttyname)); 483 if (vm->vm_shutdown) { 484 /* XXX there might be a nicer way */ 485 (void)strlcat(vir.vir_info.vir_name, 486 " - stopping", 487 sizeof(vir.vir_info.vir_name)); 488 } 489 /* get the user id who started the vm */ 490 vir.vir_uid = vm->vm_uid; 491 vir.vir_gid = vm->vm_params.vmc_owner.gid; 492 } 493 if (proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type, 494 imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) { 495 log_debug("%s: GET_INFO_VM failed for vm %d, removing", 496 __func__, vm->vm_vmid); 497 vm_remove(vm, __func__); 498 return (-1); 499 } 500 break; 501 case IMSG_VMDOP_GET_INFO_VM_END_DATA: 502 /* 503 * PROC_VMM has responded with the *running* VMs, now we 504 * append the others. These use the special value 0 for their 505 * kernel id to indicate that they are not running. 506 */ 507 TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { 508 if (!vm->vm_running) { 509 memset(&vir, 0, sizeof(vir)); 510 vir.vir_info.vir_id = vm->vm_vmid; 511 strlcpy(vir.vir_info.vir_name, 512 vm->vm_params.vmc_params.vcp_name, 513 VMM_MAX_NAME_LEN); 514 vir.vir_info.vir_memory_size = 515 vm->vm_params.vmc_params. 
516 vcp_memranges[0].vmr_size; 517 vir.vir_info.vir_ncpus = 518 vm->vm_params.vmc_params.vcp_ncpus; 519 /* get the configured user id for this vm */ 520 vir.vir_uid = vm->vm_params.vmc_owner.uid; 521 vir.vir_gid = vm->vm_params.vmc_owner.gid; 522 if (proc_compose_imsg(ps, PROC_CONTROL, -1, 523 IMSG_VMDOP_GET_INFO_VM_DATA, 524 imsg->hdr.peerid, -1, &vir, 525 sizeof(vir)) == -1) { 526 log_debug("%s: GET_INFO_VM_END failed", 527 __func__); 528 vm_remove(vm, __func__); 529 return (-1); 530 } 531 } 532 } 533 IMSG_SIZE_CHECK(imsg, &res); 534 proc_forward_imsg(ps, imsg, PROC_CONTROL, -1); 535 break; 536 default: 537 return (-1); 538 } 539 540 return (0); 541} 542 543int 544vmd_check_vmh(struct vm_dump_header *vmh) 545{ 546 int i; 547 unsigned int code, leaf; 548 unsigned int a, b, c, d; 549 550 551 if (vmh->vmh_version != VM_DUMP_VERSION) { 552 log_warnx("%s: incompatible dump version", __func__); 553 return (-1); 554 } 555 556 for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) { 557 code = vmh->vmh_cpuids[i].code; 558 leaf = vmh->vmh_cpuids[i].leaf; 559 if (leaf != 0x00) { 560 log_debug("%s: invalid leaf 0x%x for code 0x%x", 561 __func__, leaf, code); 562 return (-1); 563 } 564 565 switch (code) { 566 case 0x00: 567 CPUID_LEAF(code, leaf, a, b, c, d); 568 if (vmh->vmh_cpuids[i].a > a) { 569 log_debug("%s: incompatible cpuid level", 570 __func__); 571 return (-1); 572 } 573 if (!(vmh->vmh_cpuids[i].b == b && 574 vmh->vmh_cpuids[i].c == c && 575 vmh->vmh_cpuids[i].d == d)) { 576 log_debug("%s: incompatible cpu brand", 577 __func__); 578 return (-1); 579 } 580 break; 581 582 case 0x01: 583 CPUID_LEAF(code, leaf, a, b, c, d); 584 if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) != 585 (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) { 586 log_debug("%s: incompatible cpu features " 587 "code: 0x%x leaf: 0x%x reg: c", __func__, 588 code, leaf); 589 return (-1); 590 } 591 if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) != 592 (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) { 593 
log_debug("%s: incompatible cpu features " 594 "code: 0x%x leaf: 0x%x reg: d", __func__, 595 code, leaf); 596 return (-1); 597 } 598 break; 599 600 case 0x07: 601 CPUID_LEAF(code, leaf, a, b, c, d); 602 if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) != 603 (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) { 604 log_debug("%s: incompatible cpu features " 605 "code: 0x%x leaf: 0x%x reg: c", __func__, 606 code, leaf); 607 return (-1); 608 } 609 if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) != 610 (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) { 611 log_debug("%s: incompatible cpu features " 612 "code: 0x%x leaf: 0x%x reg: d", __func__, 613 code, leaf); 614 return (-1); 615 } 616 break; 617 618 case 0x0d: 619 CPUID_LEAF(code, leaf, a, b, c, d); 620 if (vmh->vmh_cpuids[i].b > b) { 621 log_debug("%s: incompatible cpu: insufficient " 622 "max save area for enabled XCR0 features", 623 __func__); 624 return (-1); 625 } 626 if (vmh->vmh_cpuids[i].c > c) { 627 log_debug("%s: incompatible cpu: insufficient " 628 "max save area for supported XCR0 features", 629 __func__); 630 return (-1); 631 } 632 break; 633 634 case 0x80000001: 635 CPUID_LEAF(code, leaf, a, b, c, d); 636 if ((vmh->vmh_cpuids[i].a & a) != 637 vmh->vmh_cpuids[i].a) { 638 log_debug("%s: incompatible cpu features " 639 "code: 0x%x leaf: 0x%x reg: a", __func__, 640 code, leaf); 641 return (-1); 642 } 643 if ((vmh->vmh_cpuids[i].c & c) != 644 vmh->vmh_cpuids[i].c) { 645 log_debug("%s: incompatible cpu features " 646 "code: 0x%x leaf: 0x%x reg: c", __func__, 647 code, leaf); 648 return (-1); 649 } 650 if ((vmh->vmh_cpuids[i].d & d) != 651 vmh->vmh_cpuids[i].d) { 652 log_debug("%s: incompatible cpu features " 653 "code: 0x%x leaf: 0x%x reg: d", __func__, 654 code, leaf); 655 return (-1); 656 } 657 break; 658 659 default: 660 log_debug("%s: unknown code 0x%x", __func__, code); 661 return (-1); 662 } 663 } 664 665 return (0); 666} 667 668void 669vmd_sighdlr(int sig, short event, void *arg) 670{ 671 if (privsep_process 
!= PROC_PARENT) 672 return; 673 log_debug("%s: handling signal", __func__); 674 675 switch (sig) { 676 case SIGHUP: 677 log_info("%s: reload requested with SIGHUP", __func__); 678 679 /* 680 * This is safe because libevent uses async signal handlers 681 * that run in the event loop and not in signal context. 682 */ 683 (void)vmd_reload(0, NULL); 684 break; 685 case SIGPIPE: 686 log_info("%s: ignoring SIGPIPE", __func__); 687 break; 688 case SIGUSR1: 689 log_info("%s: ignoring SIGUSR1", __func__); 690 break; 691 case SIGTERM: 692 case SIGINT: 693 vmd_shutdown(); 694 break; 695 default: 696 fatalx("unexpected signal"); 697 } 698} 699 700__dead void 701usage(void) 702{ 703 extern char *__progname; 704 fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n", 705 __progname); 706 exit(1); 707} 708 709int 710main(int argc, char **argv) 711{ 712 struct privsep *ps; 713 int ch; 714 const char *conffile = VMD_CONF; 715 enum privsep_procid proc_id = PROC_PARENT; 716 int proc_instance = 0; 717 const char *errp, *title = NULL; 718 int argc0 = argc; 719 720 log_init(0, LOG_DAEMON); 721 722 if ((env = calloc(1, sizeof(*env))) == NULL) 723 fatal("calloc: env"); 724 725 while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) { 726 switch (ch) { 727 case 'D': 728 if (cmdline_symset(optarg) < 0) 729 log_warnx("could not parse macro definition %s", 730 optarg); 731 break; 732 case 'd': 733 env->vmd_debug = 2; 734 break; 735 case 'f': 736 conffile = optarg; 737 break; 738 case 'v': 739 env->vmd_verbose++; 740 break; 741 case 'n': 742 env->vmd_noaction = 1; 743 break; 744 case 'P': 745 title = optarg; 746 proc_id = proc_getid(procs, nitems(procs), title); 747 if (proc_id == PROC_MAX) 748 fatalx("invalid process name"); 749 break; 750 case 'I': 751 proc_instance = strtonum(optarg, 0, 752 PROC_MAX_INSTANCES, &errp); 753 if (errp) 754 fatalx("invalid process instance"); 755 break; 756 default: 757 usage(); 758 } 759 } 760 761 argc -= optind; 762 if (argc > 0) 763 usage(); 764 765 
if (env->vmd_noaction && !env->vmd_debug) 766 env->vmd_debug = 1; 767 768 /* check for root privileges */ 769 if (env->vmd_noaction == 0) { 770 if (geteuid()) 771 fatalx("need root privileges"); 772 } 773 774 ps = &env->vmd_ps; 775 ps->ps_env = env; 776 env->vmd_fd = -1; 777 778 if (config_init(env) == -1) 779 fatal("failed to initialize configuration"); 780 781 if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL) 782 fatal("unknown user %s", VMD_USER); 783 784 /* First proc runs as root without pledge but in default chroot */ 785 proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */ 786 proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */ 787 788 /* Open /dev/vmm */ 789 if (env->vmd_noaction == 0) { 790 env->vmd_fd = open(VMM_NODE, O_RDWR); 791 if (env->vmd_fd == -1) 792 fatal("%s", VMM_NODE); 793 } 794 795 /* Configure the control socket */ 796 ps->ps_csock.cs_name = SOCKET_NAME; 797 TAILQ_INIT(&ps->ps_rcsocks); 798 799 /* Configuration will be parsed after forking the children */ 800 env->vmd_conffile = conffile; 801 802 log_init(env->vmd_debug, LOG_DAEMON); 803 log_setverbose(env->vmd_verbose); 804 805 if (env->vmd_noaction) 806 ps->ps_noaction = 1; 807 ps->ps_instance = proc_instance; 808 if (title != NULL) 809 ps->ps_title[proc_id] = title; 810 811 /* only the parent returns */ 812 proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv, 813 proc_id); 814 815 log_procinit("parent"); 816 if (!env->vmd_debug && daemon(0, 0) == -1) 817 fatal("can't daemonize"); 818 819 if (ps->ps_noaction == 0) 820 log_info("startup"); 821 822 event_init(); 823 824 signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps); 825 signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps); 826 signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps); 827 signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps); 828 signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps); 829 830 signal_add(&ps->ps_evsigint, NULL); 831 signal_add(&ps->ps_evsigterm, NULL); 832 
signal_add(&ps->ps_evsighup, NULL); 833 signal_add(&ps->ps_evsigpipe, NULL); 834 signal_add(&ps->ps_evsigusr1, NULL); 835 836 if (!env->vmd_noaction) 837 proc_connect(ps); 838 839 if (vmd_configure() == -1) 840 fatalx("configuration failed"); 841 842 event_dispatch(); 843 844 log_debug("parent exiting"); 845 846 return (0); 847} 848 849int 850vmd_configure(void) 851{ 852 struct vmd_vm *vm; 853 struct vmd_switch *vsw; 854 855 if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1) 856 fatal("open %s", PATH_PTMDEV); 857 858 /* 859 * pledge in the parent process: 860 * stdio - for malloc and basic I/O including events. 861 * rpath - for reload to open and read the configuration files. 862 * wpath - for opening disk images and tap devices. 863 * tty - for openpty and TIOCUCNTL. 864 * proc - run kill to terminate its children safely. 865 * sendfd - for disks, interfaces and other fds. 866 * recvfd - for send and receive. 867 * getpw - lookup user or group id by name. 868 * chown, fattr - change tty ownership 869 * flock - locking disk files 870 */ 871 if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw" 872 " chown fattr flock", NULL) == -1) 873 fatal("pledge"); 874 875 if (parse_config(env->vmd_conffile) == -1) { 876 proc_kill(&env->vmd_ps); 877 exit(1); 878 } 879 880 if (env->vmd_noaction) { 881 fprintf(stderr, "configuration OK\n"); 882 proc_kill(&env->vmd_ps); 883 exit(0); 884 } 885 886 /* Send shared global configuration to all children */ 887 if (config_setconfig(env) == -1) 888 return (-1); 889 890 TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) { 891 if (vsw->sw_running) 892 continue; 893 if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) { 894 log_warn("%s: failed to create switch %s", 895 __func__, vsw->sw_name); 896 switch_remove(vsw); 897 return (-1); 898 } 899 } 900 901 TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { 902 if (vm->vm_disabled) { 903 log_debug("%s: not creating vm %s (disabled)", 904 __func__, 905 vm->vm_params.vmc_params.vcp_name); 
906 continue; 907 } 908 if (config_setvm(&env->vmd_ps, vm, 909 -1, vm->vm_params.vmc_owner.uid) == -1) 910 return (-1); 911 } 912 913 return (0); 914} 915 916int 917vmd_reload(unsigned int reset, const char *filename) 918{ 919 struct vmd_vm *vm, *next_vm; 920 struct vmd_switch *vsw; 921 int reload = 0; 922 923 /* Switch back to the default config file */ 924 if (filename == NULL || *filename == '\0') { 925 filename = env->vmd_conffile; 926 reload = 1; 927 } 928 929 log_debug("%s: level %d config file %s", __func__, reset, filename); 930 931 if (reset) { 932 /* Purge the configuration */ 933 config_purge(env, reset); 934 config_setreset(env, reset); 935 } else { 936 /* 937 * Load or reload the configuration. 938 * 939 * Reloading removes all non-running VMs before processing the 940 * config file, whereas loading only adds to the existing list 941 * of VMs. 942 */ 943 944 if (reload) { 945 TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, 946 next_vm) { 947 if (vm->vm_running == 0) { 948 DPRINTF("%s: calling vm_remove", 949 __func__); 950 vm_remove(vm, __func__); 951 } 952 } 953 } 954 955 if (parse_config(filename) == -1) { 956 log_debug("%s: failed to load config file %s", 957 __func__, filename); 958 return (-1); 959 } 960 961 if (reload) { 962 /* Update shared global configuration in all children */ 963 if (config_setconfig(env) == -1) 964 return (-1); 965 } 966 967 TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) { 968 if (vsw->sw_running) 969 continue; 970 if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) { 971 log_warn("%s: failed to create switch %s", 972 __func__, vsw->sw_name); 973 switch_remove(vsw); 974 return (-1); 975 } 976 } 977 978 TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { 979 if (vm->vm_running == 0) { 980 if (vm->vm_disabled) { 981 log_debug("%s: not creating vm %s" 982 " (disabled)", __func__, 983 vm->vm_params.vmc_params.vcp_name); 984 continue; 985 } 986 if (config_setvm(&env->vmd_ps, vm, 987 -1, vm->vm_params.vmc_owner.uid) == -1) 988 return (-1); 
989 } else { 990 log_debug("%s: not creating vm \"%s\": " 991 "(running)", __func__, 992 vm->vm_params.vmc_params.vcp_name); 993 } 994 } 995 } 996 997 return (0); 998} 999 1000void 1001vmd_shutdown(void) 1002{ 1003 struct vmd_vm *vm, *vm_next; 1004 1005 log_debug("%s: performing shutdown", __func__); 1006 1007 TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) { 1008 vm_remove(vm, __func__); 1009 } 1010 1011 proc_kill(&env->vmd_ps); 1012 free(env); 1013 1014 log_warnx("parent terminating"); 1015 exit(0); 1016} 1017 1018struct vmd_vm * 1019vm_getbyvmid(uint32_t vmid) 1020{ 1021 struct vmd_vm *vm; 1022 1023 if (vmid == 0) 1024 return (NULL); 1025 TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { 1026 if (vm->vm_vmid == vmid) 1027 return (vm); 1028 } 1029 1030 return (NULL); 1031} 1032 1033struct vmd_vm * 1034vm_getbyid(uint32_t id) 1035{ 1036 struct vmd_vm *vm; 1037 1038 if (id == 0) 1039 return (NULL); 1040 TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { 1041 if (vm->vm_params.vmc_params.vcp_id == id) 1042 return (vm); 1043 } 1044 1045 return (NULL); 1046} 1047 1048uint32_t 1049vm_id2vmid(uint32_t id, struct vmd_vm *vm) 1050{ 1051 if (vm == NULL && (vm = vm_getbyid(id)) == NULL) 1052 return (0); 1053 DPRINTF("%s: vmm id %u is vmid %u", __func__, 1054 id, vm->vm_vmid); 1055 return (vm->vm_vmid); 1056} 1057 1058uint32_t 1059vm_vmid2id(uint32_t vmid, struct vmd_vm *vm) 1060{ 1061 if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL) 1062 return (0); 1063 DPRINTF("%s: vmid %u is vmm id %u", __func__, 1064 vmid, vm->vm_params.vmc_params.vcp_id); 1065 return (vm->vm_params.vmc_params.vcp_id); 1066} 1067 1068struct vmd_vm * 1069vm_getbyname(const char *name) 1070{ 1071 struct vmd_vm *vm; 1072 1073 if (name == NULL) 1074 return (NULL); 1075 TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { 1076 if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0) 1077 return (vm); 1078 } 1079 1080 return (NULL); 1081} 1082 1083struct vmd_vm * 1084vm_getbypid(pid_t pid) 1085{ 1086 struct vmd_vm 
*vm; 1087 1088 TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { 1089 if (vm->vm_pid == pid) 1090 return (vm); 1091 } 1092 1093 return (NULL); 1094} 1095 1096void 1097vm_stop(struct vmd_vm *vm, int keeptty, const char *caller) 1098{ 1099 struct privsep *ps = &env->vmd_ps; 1100 unsigned int i, j; 1101 1102 if (vm == NULL) 1103 return; 1104 1105 log_debug("%s: %s %s stopping vm %d%s", 1106 __func__, ps->ps_title[privsep_process], caller, 1107 vm->vm_vmid, keeptty ? ", keeping tty open" : ""); 1108 1109 vm->vm_running = 0; 1110 vm->vm_shutdown = 0; 1111 1112 user_inc(&vm->vm_params.vmc_params, vm->vm_user, 0); 1113 user_put(vm->vm_user); 1114 1115 if (vm->vm_iev.ibuf.fd != -1) { 1116 event_del(&vm->vm_iev.ev); 1117 close(vm->vm_iev.ibuf.fd); 1118 } 1119 for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) { 1120 for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) { 1121 if (vm->vm_disks[i][j] != -1) { 1122 close(vm->vm_disks[i][j]); 1123 vm->vm_disks[i][j] = -1; 1124 } 1125 } 1126 } 1127 for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) { 1128 if (vm->vm_ifs[i].vif_fd != -1) { 1129 close(vm->vm_ifs[i].vif_fd); 1130 vm->vm_ifs[i].vif_fd = -1; 1131 } 1132 free(vm->vm_ifs[i].vif_name); 1133 free(vm->vm_ifs[i].vif_switch); 1134 free(vm->vm_ifs[i].vif_group); 1135 vm->vm_ifs[i].vif_name = NULL; 1136 vm->vm_ifs[i].vif_switch = NULL; 1137 vm->vm_ifs[i].vif_group = NULL; 1138 } 1139 if (vm->vm_kernel != -1) { 1140 close(vm->vm_kernel); 1141 vm->vm_kernel = -1; 1142 } 1143 if (vm->vm_cdrom != -1) { 1144 close(vm->vm_cdrom); 1145 vm->vm_cdrom = -1; 1146 } 1147 if (!keeptty) { 1148 vm_closetty(vm); 1149 vm->vm_uid = 0; 1150 } 1151} 1152 1153void 1154vm_remove(struct vmd_vm *vm, const char *caller) 1155{ 1156 struct privsep *ps = &env->vmd_ps; 1157 1158 if (vm == NULL) 1159 return; 1160 1161 log_debug("%s: %s %s removing vm %d from running config", 1162 __func__, ps->ps_title[privsep_process], caller, 1163 vm->vm_vmid); 1164 1165 TAILQ_REMOVE(env->vmd_vms, vm, vm_entry); 1166 1167 user_put(vm->vm_user); 1168 
vm_stop(vm, 0, caller); 1169 free(vm); 1170} 1171 1172int 1173vm_register(struct privsep *ps, struct vmop_create_params *vmc, 1174 struct vmd_vm **ret_vm, uint32_t id, uid_t uid) 1175{ 1176 struct vmd_vm *vm = NULL, *vm_parent = NULL; 1177 struct vm_create_params *vcp = &vmc->vmc_params; 1178 struct vmop_owner *vmo = NULL; 1179 struct vmd_user *usr = NULL; 1180 uint32_t rng; 1181 unsigned int i, j; 1182 struct vmd_switch *sw; 1183 char *s; 1184 1185 /* Check if this is an instance of another VM */ 1186 if (vm_instance(ps, &vm_parent, vmc, uid) == -1) 1187 return (-1); 1188 1189 errno = 0; 1190 *ret_vm = NULL; 1191 1192 if ((vm = vm_getbyname(vcp->vcp_name)) != NULL || 1193 (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) { 1194 if (vm_checkperm(vm, &vm->vm_params.vmc_owner, 1195 uid) != 0) { 1196 errno = EPERM; 1197 goto fail; 1198 } 1199 *ret_vm = vm; 1200 errno = EALREADY; 1201 goto fail; 1202 } 1203 1204 if (vm_parent != NULL) 1205 vmo = &vm_parent->vm_params.vmc_insowner; 1206 1207 /* non-root users can only start existing VMs or instances */ 1208 if (vm_checkperm(NULL, vmo, uid) != 0) { 1209 log_warnx("permission denied"); 1210 errno = EPERM; 1211 goto fail; 1212 } 1213 if (vmc->vmc_flags == 0) { 1214 log_warnx("invalid configuration, no devices"); 1215 errno = VMD_DISK_MISSING; 1216 goto fail; 1217 } 1218 if (vcp->vcp_ncpus == 0) 1219 vcp->vcp_ncpus = 1; 1220 if (vcp->vcp_memranges[0].vmr_size == 0) 1221 vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY; 1222 if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) { 1223 log_warnx("invalid number of CPUs"); 1224 goto fail; 1225 } else if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) { 1226 log_warnx("invalid number of disks"); 1227 goto fail; 1228 } else if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) { 1229 log_warnx("invalid number of interfaces"); 1230 goto fail; 1231 } else if (strlen(vcp->vcp_kernel) == 0 && 1232 vcp->vcp_ndisks == 0 && strlen(vcp->vcp_cdrom) == 0) { 1233 log_warnx("no kernel or disk/cdrom specified"); 1234 goto 
fail; 1235 } else if (strlen(vcp->vcp_name) == 0) { 1236 log_warnx("invalid VM name"); 1237 goto fail; 1238 } else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' || 1239 *vcp->vcp_name == '_') { 1240 log_warnx("invalid VM name"); 1241 goto fail; 1242 } else { 1243 for (s = vcp->vcp_name; *s != '\0'; ++s) { 1244 if (!(isalnum(*s) || *s == '.' || *s == '-' || 1245 *s == '_')) { 1246 log_warnx("invalid VM name"); 1247 goto fail; 1248 } 1249 } 1250 } 1251 1252 /* track active users */ 1253 if (uid != 0 && env->vmd_users != NULL && 1254 (usr = user_get(uid)) == NULL) { 1255 log_warnx("could not add user"); 1256 goto fail; 1257 } 1258 1259 if ((vm = calloc(1, sizeof(*vm))) == NULL) 1260 goto fail; 1261 1262 memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params)); 1263 vmc = &vm->vm_params; 1264 vcp = &vmc->vmc_params; 1265 vm->vm_pid = -1; 1266 vm->vm_tty = -1; 1267 vm->vm_receive_fd = -1; 1268 vm->vm_paused = 0; 1269 vm->vm_user = usr; 1270 1271 for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) 1272 for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) 1273 vm->vm_disks[i][j] = -1; 1274 for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) 1275 vm->vm_ifs[i].vif_fd = -1; 1276 for (i = 0; i < vcp->vcp_nnics; i++) { 1277 if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) { 1278 /* inherit per-interface flags from the switch */ 1279 vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK); 1280 } 1281 1282 /* 1283 * If the MAC address is zero, always randomize it in vmd(8) 1284 * because we cannot rely on the guest OS to do the right 1285 * thing like OpenBSD does. Based on ether_fakeaddr() 1286 * from the kernel, incremented by one to differentiate 1287 * the source. 
1288 */ 1289 if (memcmp(zero_mac, &vcp->vcp_macs[i], ETHER_ADDR_LEN) == 0) { 1290 rng = arc4random(); 1291 vcp->vcp_macs[i][0] = 0xfe; 1292 vcp->vcp_macs[i][1] = 0xe1; 1293 vcp->vcp_macs[i][2] = 0xba + 1; 1294 vcp->vcp_macs[i][3] = 0xd0 | ((i + 1) & 0xf); 1295 vcp->vcp_macs[i][4] = rng; 1296 vcp->vcp_macs[i][5] = rng >> 8; 1297 } 1298 } 1299 vm->vm_kernel = -1; 1300 vm->vm_cdrom = -1; 1301 vm->vm_iev.ibuf.fd = -1; 1302 1303 if (++env->vmd_nvm == 0) 1304 fatalx("too many vms"); 1305 1306 /* Assign a new internal Id if not specified */ 1307 vm->vm_vmid = id == 0 ? env->vmd_nvm : id; 1308 1309 log_debug("%s: registering vm %d", __func__, vm->vm_vmid); 1310 TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry); 1311 1312 *ret_vm = vm; 1313 return (0); 1314 fail: 1315 if (errno == 0) 1316 errno = EINVAL; 1317 return (-1); 1318} 1319 1320int 1321vm_instance(struct privsep *ps, struct vmd_vm **vm_parent, 1322 struct vmop_create_params *vmc, uid_t uid) 1323{ 1324 char *name; 1325 struct vm_create_params *vcp = &vmc->vmc_params; 1326 struct vmop_create_params *vmcp; 1327 struct vm_create_params *vcpp; 1328 struct vmd_vm *vm = NULL; 1329 unsigned int i, j; 1330 uint32_t id; 1331 1332 /* return without error if the parent is NULL (nothing to inherit) */ 1333 if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 || 1334 (*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) 1335 return (0); 1336 1337 errno = 0; 1338 vmcp = &(*vm_parent)->vm_params; 1339 vcpp = &vmcp->vmc_params; 1340 1341 /* Are we allowed to create an instance from this VM? 
*/ 1342 if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) { 1343 log_warnx("vm \"%s\" no permission to create vm instance", 1344 vcpp->vcp_name); 1345 errno = ENAMETOOLONG; 1346 return (-1); 1347 } 1348 1349 id = vcp->vcp_id; 1350 name = vcp->vcp_name; 1351 1352 if ((vm = vm_getbyname(vcp->vcp_name)) != NULL || 1353 (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) { 1354 errno = EPROCLIM; 1355 return (-1); 1356 } 1357 1358 /* CPU */ 1359 if (vcp->vcp_ncpus == 0) 1360 vcp->vcp_ncpus = vcpp->vcp_ncpus; 1361 if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 && 1362 vcp->vcp_ncpus != vcpp->vcp_ncpus) { 1363 log_warnx("vm \"%s\" no permission to set cpus", name); 1364 errno = EPERM; 1365 return (-1); 1366 } 1367 1368 /* memory */ 1369 if (vcp->vcp_memranges[0].vmr_size == 0) 1370 vcp->vcp_memranges[0].vmr_size = 1371 vcpp->vcp_memranges[0].vmr_size; 1372 if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 && 1373 vcp->vcp_memranges[0].vmr_size != 1374 vcpp->vcp_memranges[0].vmr_size) { 1375 log_warnx("vm \"%s\" no permission to set memory", name); 1376 errno = EPERM; 1377 return (-1); 1378 } 1379 1380 /* disks cannot be inherited */ 1381 if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 && 1382 vcp->vcp_ndisks) { 1383 log_warnx("vm \"%s\" no permission to set disks", name); 1384 errno = EPERM; 1385 return (-1); 1386 } 1387 for (i = 0; i < vcp->vcp_ndisks; i++) { 1388 /* Check if this disk is already used in the parent */ 1389 for (j = 0; j < vcpp->vcp_ndisks; j++) { 1390 if (strcmp(vcp->vcp_disks[i], 1391 vcpp->vcp_disks[j]) == 0) { 1392 log_warnx("vm \"%s\" disk %s cannot be reused", 1393 name, vcp->vcp_disks[i]); 1394 errno = EBUSY; 1395 return (-1); 1396 } 1397 } 1398 vmc->vmc_checkaccess |= VMOP_CREATE_DISK; 1399 } 1400 1401 /* interfaces */ 1402 if (vcp->vcp_nnics > 0 && 1403 vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 && 1404 vcp->vcp_nnics != vcpp->vcp_nnics) { 1405 log_warnx("vm \"%s\" no permission to set interfaces", name); 1406 errno = 
EPERM; 1407 return (-1); 1408 } 1409 for (i = 0; i < vcpp->vcp_nnics; i++) { 1410 /* Interface got overwritten */ 1411 if (i < vcp->vcp_nnics) 1412 continue; 1413 1414 /* Copy interface from parent */ 1415 vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i]; 1416 (void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i], 1417 sizeof(vmc->vmc_ifnames[i])); 1418 (void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i], 1419 sizeof(vmc->vmc_ifswitch[i])); 1420 (void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i], 1421 sizeof(vmc->vmc_ifgroup[i])); 1422 memcpy(vcp->vcp_macs[i], vcpp->vcp_macs[i], 1423 sizeof(vcp->vcp_macs[i])); 1424 vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i]; 1425 vcp->vcp_nnics++; 1426 } 1427 for (i = 0; i < vcp->vcp_nnics; i++) { 1428 for (j = 0; j < vcpp->vcp_nnics; j++) { 1429 if (memcmp(zero_mac, vcp->vcp_macs[i], 1430 sizeof(vcp->vcp_macs[i])) != 0 && 1431 memcmp(vcpp->vcp_macs[i], vcp->vcp_macs[i], 1432 sizeof(vcp->vcp_macs[i])) != 0) { 1433 log_warnx("vm \"%s\" lladdr cannot be reused", 1434 name); 1435 errno = EBUSY; 1436 return (-1); 1437 } 1438 if (strlen(vmc->vmc_ifnames[i]) && 1439 strcmp(vmc->vmc_ifnames[i], 1440 vmcp->vmc_ifnames[j]) == 0) { 1441 log_warnx("vm \"%s\" %s cannot be reused", 1442 vmc->vmc_ifnames[i], name); 1443 errno = EBUSY; 1444 return (-1); 1445 } 1446 } 1447 } 1448 1449 /* kernel */ 1450 if (strlen(vcp->vcp_kernel) > 0) { 1451 if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) { 1452 log_warnx("vm \"%s\" no permission to set boot image", 1453 name); 1454 errno = EPERM; 1455 return (-1); 1456 } 1457 vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL; 1458 } else if (strlcpy(vcp->vcp_kernel, vcpp->vcp_kernel, 1459 sizeof(vcp->vcp_kernel)) >= sizeof(vcp->vcp_kernel)) { 1460 log_warnx("vm \"%s\" kernel name too long", name); 1461 errno = EINVAL; 1462 return (-1); 1463 } 1464 1465 /* cdrom */ 1466 if (strlen(vcp->vcp_cdrom) > 0) { 1467 if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) { 1468 log_warnx("vm \"%s\" no 
permission to set cdrom", name); 1469 errno = EPERM; 1470 return (-1); 1471 } 1472 vmc->vmc_checkaccess |= VMOP_CREATE_CDROM; 1473 } else if (strlcpy(vcp->vcp_cdrom, vcpp->vcp_cdrom, 1474 sizeof(vcp->vcp_cdrom)) >= sizeof(vcp->vcp_cdrom)) { 1475 log_warnx("vm \"%s\" cdrom name too long", name); 1476 errno = EINVAL; 1477 return (-1); 1478 } 1479 1480 /* user */ 1481 if (vmc->vmc_owner.uid == 0) 1482 vmc->vmc_owner.uid = vmcp->vmc_owner.uid; 1483 else if (vmc->vmc_owner.uid != uid && 1484 vmc->vmc_owner.uid != vmcp->vmc_owner.uid) { 1485 log_warnx("vm \"%s\" user mismatch", name); 1486 errno = EPERM; 1487 return (-1); 1488 } 1489 1490 /* group */ 1491 if (vmc->vmc_owner.gid == 0) 1492 vmc->vmc_owner.gid = vmcp->vmc_owner.gid; 1493 else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) { 1494 log_warnx("vm \"%s\" group mismatch", name); 1495 errno = EPERM; 1496 return (-1); 1497 } 1498 1499 /* child instances */ 1500 if (vmc->vmc_insflags) { 1501 log_warnx("vm \"%s\" cannot change instance permissions", name); 1502 errno = EPERM; 1503 return (-1); 1504 } 1505 if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) { 1506 vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid; 1507 vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid; 1508 vmc->vmc_insflags = vmcp->vmc_insflags; 1509 } else { 1510 vmc->vmc_insowner.gid = 0; 1511 vmc->vmc_insowner.uid = 0; 1512 vmc->vmc_insflags = 0; 1513 } 1514 1515 /* finished, remove instance flags */ 1516 vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE; 1517 1518 return (0); 1519} 1520 1521/* 1522 * vm_checkperm 1523 * 1524 * Checks if the user represented by the 'uid' parameter is allowed to 1525 * manipulate the VM described by the 'vm' parameter (or connect to said VM's 1526 * console.) 
1527 * 1528 * Parameters: 1529 * vm: the VM whose permission is to be checked 1530 * vmo: the required uid/gid to be checked 1531 * uid: the user ID of the user making the request 1532 * 1533 * Return values: 1534 * 0: the permission should be granted 1535 * -1: the permission check failed (also returned if vm == null) 1536 */ 1537int 1538vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid) 1539{ 1540 struct group *gr; 1541 struct passwd *pw; 1542 char **grmem; 1543 1544 /* root has no restrictions */ 1545 if (uid == 0) 1546 return (0); 1547 1548 if (vmo == NULL) 1549 return (-1); 1550 1551 /* check user */ 1552 if (vm == NULL) { 1553 if (vmo->uid == uid) 1554 return (0); 1555 } else { 1556 /* 1557 * check user of running vm (the owner of a running vm can 1558 * be different to (or more specific than) the configured owner. 1559 */ 1560 if ((vm->vm_running && vm->vm_uid == uid) || 1561 (!vm->vm_running && vmo->uid == uid)) 1562 return (0); 1563 } 1564 1565 /* check groups */ 1566 if (vmo->gid != -1) { 1567 if ((pw = getpwuid(uid)) == NULL) 1568 return (-1); 1569 if (pw->pw_gid == vmo->gid) 1570 return (0); 1571 if ((gr = getgrgid(vmo->gid)) != NULL) { 1572 for (grmem = gr->gr_mem; *grmem; grmem++) 1573 if (strcmp(*grmem, pw->pw_name) == 0) 1574 return (0); 1575 } 1576 } 1577 1578 return (-1); 1579} 1580 1581/* 1582 * vm_checkinsflag 1583 * 1584 * Checks wheter the non-root user is allowed to set an instance option. 
1585 * 1586 * Parameters: 1587 * vmc: the VM create parameters 1588 * flag: the flag to be checked 1589 * uid: the user ID of the user making the request 1590 * 1591 * Return values: 1592 * 0: the permission should be granted 1593 * -1: the permission check failed (also returned if vm == null) 1594 */ 1595int 1596vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid) 1597{ 1598 /* root has no restrictions */ 1599 if (uid == 0) 1600 return (0); 1601 1602 if ((vmc->vmc_insflags & flag) == 0) 1603 return (-1); 1604 1605 return (0); 1606} 1607 1608/* 1609 * vm_checkaccess 1610 * 1611 * Checks if the user represented by the 'uid' parameter is allowed to 1612 * access the file described by the 'path' parameter. 1613 * 1614 * Parameters: 1615 * fd: the file descriptor of the opened file 1616 * uflag: check if the userid has access to the file 1617 * uid: the user ID of the user making the request 1618 * amode: the access flags of R_OK and W_OK 1619 * 1620 * Return values: 1621 * 0: the permission should be granted 1622 * -1: the permission check failed 1623 */ 1624int 1625vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode) 1626{ 1627 struct group *gr; 1628 struct passwd *pw; 1629 char **grmem; 1630 struct stat st; 1631 mode_t mode; 1632 1633 if (fd == -1) 1634 return (-1); 1635 1636 /* 1637 * File has to be accessible and a regular file 1638 */ 1639 if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode)) 1640 return (-1); 1641 1642 /* root has no restrictions */ 1643 if (uid == 0 || uflag == 0) 1644 return (0); 1645 1646 /* check other */ 1647 mode = amode & W_OK ? S_IWOTH : 0; 1648 mode |= amode & R_OK ? S_IROTH : 0; 1649 if ((st.st_mode & mode) == mode) 1650 return (0); 1651 1652 /* check user */ 1653 mode = amode & W_OK ? S_IWUSR : 0; 1654 mode |= amode & R_OK ? S_IRUSR : 0; 1655 if (uid == st.st_uid && (st.st_mode & mode) == mode) 1656 return (0); 1657 1658 /* check groups */ 1659 mode = amode & W_OK ? 
S_IWGRP : 0; 1660 mode |= amode & R_OK ? S_IRGRP : 0; 1661 if ((st.st_mode & mode) != mode) 1662 return (-1); 1663 if ((pw = getpwuid(uid)) == NULL) 1664 return (-1); 1665 if (pw->pw_gid == st.st_gid) 1666 return (0); 1667 if ((gr = getgrgid(st.st_gid)) != NULL) { 1668 for (grmem = gr->gr_mem; *grmem; grmem++) 1669 if (strcmp(*grmem, pw->pw_name) == 0) 1670 return (0); 1671 } 1672 1673 return (-1); 1674} 1675 1676int 1677vm_opentty(struct vmd_vm *vm) 1678{ 1679 struct ptmget ptm; 1680 struct stat st; 1681 struct group *gr; 1682 uid_t uid; 1683 gid_t gid; 1684 mode_t mode; 1685 int on; 1686 1687 /* 1688 * Open tty with pre-opened PTM fd 1689 */ 1690 if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1)) 1691 return (-1); 1692 1693 /* 1694 * We use user ioctl(2) mode to pass break commands. 1695 */ 1696 on = 1; 1697 if (ioctl(ptm.cfd, TIOCUCNTL, &on)) 1698 fatal("could not enable user ioctl mode"); 1699 1700 vm->vm_tty = ptm.cfd; 1701 close(ptm.sfd); 1702 if ((vm->vm_ttyname = strdup(ptm.sn)) == NULL) 1703 goto fail; 1704 1705 uid = vm->vm_uid; 1706 gid = vm->vm_params.vmc_owner.gid; 1707 1708 if (vm->vm_params.vmc_owner.gid != -1) { 1709 mode = 0660; 1710 } else if ((gr = getgrnam("tty")) != NULL) { 1711 gid = gr->gr_gid; 1712 mode = 0620; 1713 } else { 1714 mode = 0600; 1715 gid = 0; 1716 } 1717 1718 log_debug("%s: vm %s tty %s uid %d gid %d mode %o", 1719 __func__, vm->vm_params.vmc_params.vcp_name, 1720 vm->vm_ttyname, uid, gid, mode); 1721 1722 /* 1723 * Change ownership and mode of the tty as required. 
1724 * Loosely based on the implementation of sshpty.c 1725 */ 1726 if (stat(vm->vm_ttyname, &st) == -1) 1727 goto fail; 1728 1729 if (st.st_uid != uid || st.st_gid != gid) { 1730 if (chown(vm->vm_ttyname, uid, gid) == -1) { 1731 log_warn("chown %s %d %d failed, uid %d", 1732 vm->vm_ttyname, uid, gid, getuid()); 1733 1734 /* Ignore failure on read-only filesystems */ 1735 if (!((errno == EROFS) && 1736 (st.st_uid == uid || st.st_uid == 0))) 1737 goto fail; 1738 } 1739 } 1740 1741 if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) { 1742 if (chmod(vm->vm_ttyname, mode) == -1) { 1743 log_warn("chmod %s %o failed, uid %d", 1744 vm->vm_ttyname, mode, getuid()); 1745 1746 /* Ignore failure on read-only filesystems */ 1747 if (!((errno == EROFS) && 1748 (st.st_uid == uid || st.st_uid == 0))) 1749 goto fail; 1750 } 1751 } 1752 1753 return (0); 1754 fail: 1755 vm_closetty(vm); 1756 return (-1); 1757} 1758 1759void 1760vm_closetty(struct vmd_vm *vm) 1761{ 1762 if (vm->vm_tty != -1) { 1763 /* Release and close the tty */ 1764 if (fchown(vm->vm_tty, 0, 0) == -1) 1765 log_warn("chown %s 0 0 failed", vm->vm_ttyname); 1766 if (fchmod(vm->vm_tty, 0666) == -1) 1767 log_warn("chmod %s 0666 failed", vm->vm_ttyname); 1768 close(vm->vm_tty); 1769 vm->vm_tty = -1; 1770 } 1771 free(vm->vm_ttyname); 1772 vm->vm_ttyname = NULL; 1773} 1774 1775void 1776switch_remove(struct vmd_switch *vsw) 1777{ 1778 if (vsw == NULL) 1779 return; 1780 1781 TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry); 1782 1783 free(vsw->sw_group); 1784 free(vsw->sw_name); 1785 free(vsw); 1786} 1787 1788struct vmd_switch * 1789switch_getbyname(const char *name) 1790{ 1791 struct vmd_switch *vsw; 1792 1793 if (name == NULL) 1794 return (NULL); 1795 TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) { 1796 if (strcmp(vsw->sw_name, name) == 0) 1797 return (vsw); 1798 } 1799 1800 return (NULL); 1801} 1802 1803struct vmd_user * 1804user_get(uid_t uid) 1805{ 1806 struct vmd_user *usr; 1807 1808 if (uid == 0) 1809 return 
(NULL); 1810 1811 /* first try to find an existing user */ 1812 TAILQ_FOREACH(usr, env->vmd_users, usr_entry) { 1813 if (usr->usr_id.uid == uid) 1814 goto done; 1815 } 1816 1817 if ((usr = calloc(1, sizeof(*usr))) == NULL) { 1818 log_warn("could not allocate user"); 1819 return (NULL); 1820 } 1821 1822 usr->usr_id.uid = uid; 1823 usr->usr_id.gid = -1; 1824 TAILQ_INSERT_TAIL(env->vmd_users, usr, usr_entry); 1825 1826 done: 1827 DPRINTF("%s: uid %d #%d +", 1828 __func__, usr->usr_id.uid, usr->usr_refcnt + 1); 1829 usr->usr_refcnt++; 1830 1831 return (usr); 1832} 1833 1834void 1835user_put(struct vmd_user *usr) 1836{ 1837 if (usr == NULL) 1838 return; 1839 1840 DPRINTF("%s: uid %d #%d -", 1841 __func__, usr->usr_id.uid, usr->usr_refcnt - 1); 1842 1843 if (--usr->usr_refcnt > 0) 1844 return; 1845 1846 TAILQ_REMOVE(env->vmd_users, usr, usr_entry); 1847 free(usr); 1848} 1849 1850void 1851user_inc(struct vm_create_params *vcp, struct vmd_user *usr, int inc) 1852{ 1853 char mem[FMT_SCALED_STRSIZE]; 1854 1855 if (usr == NULL) 1856 return; 1857 1858 /* increment or decrement counters */ 1859 inc = inc ? 1 : -1; 1860 1861 usr->usr_maxcpu += vcp->vcp_ncpus * inc; 1862 usr->usr_maxmem += vcp->vcp_memranges[0].vmr_size * inc; 1863 usr->usr_maxifs += vcp->vcp_nnics * inc; 1864 1865 if (log_getverbose() > 1) { 1866 (void)fmt_scaled(usr->usr_maxmem * 1024 * 1024, mem); 1867 log_debug("%s: %c uid %d ref %d cpu %llu mem %s ifs %llu", 1868 __func__, inc == 1 ? 
'+' : '-', 1869 usr->usr_id.uid, usr->usr_refcnt, 1870 usr->usr_maxcpu, mem, usr->usr_maxifs); 1871 } 1872} 1873 1874int 1875user_checklimit(struct vmd_user *usr, struct vm_create_params *vcp) 1876{ 1877 const char *limit = ""; 1878 1879 /* XXX make the limits configurable */ 1880 if (usr->usr_maxcpu > VM_DEFAULT_USER_MAXCPU) { 1881 limit = "cpu "; 1882 goto fail; 1883 } 1884 if (usr->usr_maxmem > VM_DEFAULT_USER_MAXMEM) { 1885 limit = "memory "; 1886 goto fail; 1887 } 1888 if (usr->usr_maxifs > VM_DEFAULT_USER_MAXIFS) { 1889 limit = "interface "; 1890 goto fail; 1891 } 1892 1893 return (0); 1894 1895 fail: 1896 log_warnx("%s: user %d %slimit reached", vcp->vcp_name, 1897 usr->usr_id.uid, limit); 1898 return (-1); 1899} 1900 1901char * 1902get_string(uint8_t *ptr, size_t len) 1903{ 1904 size_t i; 1905 1906 for (i = 0; i < len; i++) 1907 if (!isprint(ptr[i])) 1908 break; 1909 1910 return strndup(ptr, i); 1911} 1912 1913uint32_t 1914prefixlen2mask(uint8_t prefixlen) 1915{ 1916 if (prefixlen == 0) 1917 return (0); 1918 1919 if (prefixlen > 32) 1920 prefixlen = 32; 1921 1922 return (htonl(0xffffffff << (32 - prefixlen))); 1923} 1924 1925void 1926prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask) 1927{ 1928 struct in6_addr s6; 1929 int i; 1930 1931 if (prefixlen > 128) 1932 prefixlen = 128; 1933 1934 memset(&s6, 0, sizeof(s6)); 1935 for (i = 0; i < prefixlen / 8; i++) 1936 s6.s6_addr[i] = 0xff; 1937 i = prefixlen % 8; 1938 if (i) 1939 s6.s6_addr[prefixlen / 8] = 0xff00 >> i; 1940 1941 memcpy(mask, &s6, sizeof(s6)); 1942} 1943 1944void 1945getmonotime(struct timeval *tv) 1946{ 1947 struct timespec ts; 1948 1949 if (clock_gettime(CLOCK_MONOTONIC, &ts)) 1950 fatal("clock_gettime"); 1951 1952 TIMESPEC_TO_TIMEVAL(tv, &ts); 1953} 1954