vmd.c revision 1.98
1/* $OpenBSD: vmd.c,v 1.98 2018/07/15 14:36:54 reyk Exp $ */ 2 3/* 4 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19#include <sys/param.h> /* nitems */ 20#include <sys/queue.h> 21#include <sys/wait.h> 22#include <sys/cdefs.h> 23#include <sys/stat.h> 24#include <sys/tty.h> 25#include <sys/ttycom.h> 26#include <sys/ioctl.h> 27 28#include <stdio.h> 29#include <stdlib.h> 30#include <string.h> 31#include <termios.h> 32#include <errno.h> 33#include <event.h> 34#include <fcntl.h> 35#include <pwd.h> 36#include <signal.h> 37#include <syslog.h> 38#include <unistd.h> 39#include <util.h> 40#include <ctype.h> 41#include <pwd.h> 42#include <grp.h> 43 44#include <machine/specialreg.h> 45#include <machine/vmmvar.h> 46 47#include "proc.h" 48#include "atomicio.h" 49#include "vmd.h" 50 51__dead void usage(void); 52 53int main(int, char **); 54int vmd_configure(void); 55void vmd_sighdlr(int sig, short event, void *arg); 56void vmd_shutdown(void); 57int vmd_control_run(void); 58int vmd_dispatch_control(int, struct privsep_proc *, struct imsg *); 59int vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *); 60int vmd_check_vmh(struct vm_dump_header *); 61 62int vm_instance(struct privsep *, struct vmd_vm **, 63 struct 
vmop_create_params *, uid_t); 64int vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t); 65 66struct vmd *env; 67 68static struct privsep_proc procs[] = { 69 /* Keep "priv" on top as procs[0] */ 70 { "priv", PROC_PRIV, NULL, priv }, 71 { "control", PROC_CONTROL, vmd_dispatch_control, control }, 72 { "vmm", PROC_VMM, vmd_dispatch_vmm, vmm, vmm_shutdown }, 73}; 74 75/* For the privileged process */ 76static struct privsep_proc *proc_priv = &procs[0]; 77static struct passwd proc_privpw; 78static const uint8_t zero_mac[ETHER_ADDR_LEN]; 79 80int 81vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg) 82{ 83 struct privsep *ps = p->p_ps; 84 int res = 0, ret = 0, cmd = 0, verbose; 85 unsigned int v = 0, flags; 86 struct vmop_create_params vmc; 87 struct vmop_id vid; 88 struct vmop_result vmr; 89 struct vm_dump_header vmh; 90 struct vmd_vm *vm = NULL; 91 char *str = NULL; 92 uint32_t id = 0; 93 struct control_sock *rcs; 94 95 switch (imsg->hdr.type) { 96 case IMSG_VMDOP_START_VM_REQUEST: 97 IMSG_SIZE_CHECK(imsg, &vmc); 98 memcpy(&vmc, imsg->data, sizeof(vmc)); 99 ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid); 100 if (vmc.vmc_flags == 0) { 101 /* start an existing VM with pre-configured options */ 102 if (!(ret == -1 && errno == EALREADY && 103 vm->vm_running == 0)) { 104 res = errno; 105 cmd = IMSG_VMDOP_START_VM_RESPONSE; 106 } 107 } else if (ret != 0) { 108 res = errno; 109 cmd = IMSG_VMDOP_START_VM_RESPONSE; 110 } 111 if (res == 0 && 112 config_setvm(ps, vm, 113 imsg->hdr.peerid, vm->vm_params.vmc_owner.uid) == -1) { 114 res = errno; 115 cmd = IMSG_VMDOP_START_VM_RESPONSE; 116 } 117 break; 118 case IMSG_VMDOP_TERMINATE_VM_REQUEST: 119 IMSG_SIZE_CHECK(imsg, &vid); 120 memcpy(&vid, imsg->data, sizeof(vid)); 121 flags = vid.vid_flags; 122 123 if ((id = vid.vid_id) == 0) { 124 /* Lookup vm (id) by name */ 125 if ((vm = vm_getbyname(vid.vid_name)) == NULL) { 126 res = ENOENT; 127 cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; 128 break; 129 
} else if (vm->vm_shutdown && 130 (flags & VMOP_FORCE) == 0) { 131 res = EALREADY; 132 cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; 133 break; 134 } else if (vm->vm_running == 0) { 135 res = EINVAL; 136 cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; 137 break; 138 } 139 id = vm->vm_vmid; 140 } else if ((vm = vm_getbyvmid(id)) == NULL) { 141 res = ENOENT; 142 cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; 143 break; 144 } 145 if (vm_checkperm(vm, &vm->vm_params.vmc_owner, 146 vid.vid_uid) != 0) { 147 res = EPERM; 148 cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; 149 break; 150 } 151 152 memset(&vid, 0, sizeof(vid)); 153 vid.vid_id = id; 154 vid.vid_flags = flags; 155 if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type, 156 imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1) 157 return (-1); 158 break; 159 case IMSG_VMDOP_GET_INFO_VM_REQUEST: 160 proc_forward_imsg(ps, imsg, PROC_VMM, -1); 161 break; 162 case IMSG_VMDOP_LOAD: 163 IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */ 164 str = get_string((uint8_t *)imsg->data, 165 IMSG_DATA_SIZE(imsg)); 166 case IMSG_VMDOP_RELOAD: 167 if (vmd_reload(0, str) == -1) 168 cmd = IMSG_CTL_FAIL; 169 else 170 cmd = IMSG_CTL_OK; 171 free(str); 172 break; 173 case IMSG_CTL_RESET: 174 IMSG_SIZE_CHECK(imsg, &v); 175 memcpy(&v, imsg->data, sizeof(v)); 176 if (vmd_reload(v, NULL) == -1) 177 cmd = IMSG_CTL_FAIL; 178 else 179 cmd = IMSG_CTL_OK; 180 break; 181 case IMSG_CTL_VERBOSE: 182 IMSG_SIZE_CHECK(imsg, &verbose); 183 memcpy(&verbose, imsg->data, sizeof(verbose)); 184 log_setverbose(verbose); 185 186 proc_forward_imsg(ps, imsg, PROC_VMM, -1); 187 proc_forward_imsg(ps, imsg, PROC_PRIV, -1); 188 cmd = IMSG_CTL_OK; 189 break; 190 case IMSG_VMDOP_PAUSE_VM: 191 case IMSG_VMDOP_UNPAUSE_VM: 192 IMSG_SIZE_CHECK(imsg, &vid); 193 memcpy(&vid, imsg->data, sizeof(vid)); 194 if (vid.vid_id == 0) { 195 if ((vm = vm_getbyname(vid.vid_name)) == NULL) { 196 res = ENOENT; 197 cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE; 198 break; 199 } else { 200 vid.vid_id = vm->vm_vmid; 
201 } 202 } else if ((vm = vm_getbyid(vid.vid_id)) == NULL) { 203 res = ENOENT; 204 cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE; 205 break; 206 } 207 if (vm_checkperm(vm, &vm->vm_params.vmc_owner, 208 vid.vid_uid) != 0) { 209 res = EPERM; 210 cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE; 211 break; 212 } 213 proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type, 214 imsg->hdr.peerid, -1, &vid, sizeof(vid)); 215 break; 216 case IMSG_VMDOP_SEND_VM_REQUEST: 217 IMSG_SIZE_CHECK(imsg, &vid); 218 memcpy(&vid, imsg->data, sizeof(vid)); 219 id = vid.vid_id; 220 if (vid.vid_id == 0) { 221 if ((vm = vm_getbyname(vid.vid_name)) == NULL) { 222 res = ENOENT; 223 cmd = IMSG_VMDOP_SEND_VM_RESPONSE; 224 close(imsg->fd); 225 break; 226 } else { 227 vid.vid_id = vm->vm_vmid; 228 } 229 } else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) { 230 res = ENOENT; 231 cmd = IMSG_VMDOP_SEND_VM_RESPONSE; 232 close(imsg->fd); 233 break; 234 } else { 235 } 236 vmr.vmr_id = vid.vid_id; 237 log_debug("%s: sending fd to vmm", __func__); 238 proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type, 239 imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid)); 240 break; 241 case IMSG_VMDOP_RECEIVE_VM_REQUEST: 242 IMSG_SIZE_CHECK(imsg, &vid); 243 memcpy(&vid, imsg->data, sizeof(vid)); 244 if (imsg->fd == -1) { 245 log_warnx("%s: invalid fd", __func__); 246 return (-1); 247 } 248 if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) != 249 sizeof(vmh)) { 250 log_warnx("%s: error reading vmh from received vm", 251 __func__); 252 res = EIO; 253 close(imsg->fd); 254 cmd = IMSG_VMDOP_START_VM_RESPONSE; 255 break; 256 } 257 258 if (vmd_check_vmh(&vmh)) { 259 res = ENOENT; 260 close(imsg->fd); 261 cmd = IMSG_VMDOP_START_VM_RESPONSE; 262 break; 263 } 264 if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) != 265 sizeof(vmc)) { 266 log_warnx("%s: error reading vmc from received vm", 267 __func__); 268 res = EIO; 269 close(imsg->fd); 270 cmd = IMSG_VMDOP_START_VM_RESPONSE; 271 break; 272 } 273 strlcpy(vmc.vmc_params.vcp_name, vid.vid_name, 274 
sizeof(vmc.vmc_params.vcp_name)); 275 vmc.vmc_params.vcp_id = 0; 276 277 ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid); 278 if (ret != 0) { 279 res = errno; 280 cmd = IMSG_VMDOP_START_VM_RESPONSE; 281 close(imsg->fd); 282 } else { 283 vm->vm_received = 1; 284 config_setvm(ps, vm, imsg->hdr.peerid, 285 vmc.vmc_owner.uid); 286 log_debug("%s: sending fd to vmm", __func__); 287 proc_compose_imsg(ps, PROC_VMM, -1, 288 IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd, 289 NULL, 0); 290 } 291 break; 292 case IMSG_VMDOP_DONE: 293 control_reset(&ps->ps_csock); 294 TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry) 295 control_reset(rcs); 296 cmd = 0; 297 break; 298 default: 299 return (-1); 300 } 301 302 switch (cmd) { 303 case 0: 304 break; 305 case IMSG_VMDOP_START_VM_RESPONSE: 306 case IMSG_VMDOP_TERMINATE_VM_RESPONSE: 307 memset(&vmr, 0, sizeof(vmr)); 308 vmr.vmr_result = res; 309 vmr.vmr_id = id; 310 if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd, 311 imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1) 312 return (-1); 313 break; 314 default: 315 if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd, 316 imsg->hdr.peerid, -1, &res, sizeof(res)) == -1) 317 return (-1); 318 break; 319 } 320 321 return (0); 322} 323 324int 325vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg) 326{ 327 struct vmop_result vmr; 328 struct privsep *ps = p->p_ps; 329 int res = 0; 330 struct vmd_vm *vm; 331 struct vm_create_params *vcp; 332 struct vmop_info_result vir; 333 334 switch (imsg->hdr.type) { 335 case IMSG_VMDOP_PAUSE_VM_RESPONSE: 336 IMSG_SIZE_CHECK(imsg, &vmr); 337 memcpy(&vmr, imsg->data, sizeof(vmr)); 338 if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) 339 break; 340 proc_compose_imsg(ps, PROC_CONTROL, -1, 341 imsg->hdr.type, imsg->hdr.peerid, -1, 342 imsg->data, sizeof(imsg->data)); 343 log_info("%s: paused vm %d successfully", 344 vm->vm_params.vmc_params.vcp_name, 345 vm->vm_vmid); 346 break; 347 case IMSG_VMDOP_UNPAUSE_VM_RESPONSE: 348 IMSG_SIZE_CHECK(imsg, &vmr); 349 
memcpy(&vmr, imsg->data, sizeof(vmr)); 350 if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) 351 break; 352 proc_compose_imsg(ps, PROC_CONTROL, -1, 353 imsg->hdr.type, imsg->hdr.peerid, -1, 354 imsg->data, sizeof(imsg->data)); 355 log_info("%s: unpaused vm %d successfully.", 356 vm->vm_params.vmc_params.vcp_name, 357 vm->vm_vmid); 358 break; 359 case IMSG_VMDOP_START_VM_RESPONSE: 360 IMSG_SIZE_CHECK(imsg, &vmr); 361 memcpy(&vmr, imsg->data, sizeof(vmr)); 362 if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) 363 break; 364 vm->vm_pid = vmr.vmr_pid; 365 vcp = &vm->vm_params.vmc_params; 366 vcp->vcp_id = vmr.vmr_id; 367 368 /* 369 * If the peerid is not -1, forward the response back to the 370 * the control socket. If it is -1, the request originated 371 * from the parent, not the control socket. 372 */ 373 if (vm->vm_peerid != (uint32_t)-1) { 374 (void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname, 375 sizeof(vmr.vmr_ttyname)); 376 if (proc_compose_imsg(ps, PROC_CONTROL, -1, 377 imsg->hdr.type, vm->vm_peerid, -1, 378 &vmr, sizeof(vmr)) == -1) { 379 errno = vmr.vmr_result; 380 log_warn("%s: failed to foward vm result", 381 vcp->vcp_name); 382 vm_remove(vm, __func__); 383 return (-1); 384 } 385 } 386 387 if (vmr.vmr_result) { 388 errno = vmr.vmr_result; 389 log_warn("%s: failed to start vm", vcp->vcp_name); 390 vm_remove(vm, __func__); 391 break; 392 } 393 394 /* Now configure all the interfaces */ 395 if (vm_priv_ifconfig(ps, vm) == -1) { 396 log_warn("%s: failed to configure vm", vcp->vcp_name); 397 vm_remove(vm, __func__); 398 break; 399 } 400 401 log_info("%s: started vm %d successfully, tty %s", 402 vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname); 403 break; 404 case IMSG_VMDOP_TERMINATE_VM_RESPONSE: 405 IMSG_SIZE_CHECK(imsg, &vmr); 406 memcpy(&vmr, imsg->data, sizeof(vmr)); 407 DPRINTF("%s: forwarding TERMINATE VM for vm id %d", 408 __func__, vmr.vmr_id); 409 proc_forward_imsg(ps, imsg, PROC_CONTROL, -1); 410 if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) 411 break; 412 if 
(vmr.vmr_result == 0) { 413 /* Mark VM as shutting down */ 414 vm->vm_shutdown = 1; 415 } 416 break; 417 case IMSG_VMDOP_SEND_VM_RESPONSE: 418 IMSG_SIZE_CHECK(imsg, &vmr); 419 memcpy(&vmr, imsg->data, sizeof(vmr)); 420 if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) 421 break; 422 if (!vmr.vmr_result) 423 log_info("%s: sent vm %d successfully.", 424 vm->vm_params.vmc_params.vcp_name, 425 vm->vm_vmid); 426 case IMSG_VMDOP_TERMINATE_VM_EVENT: 427 IMSG_SIZE_CHECK(imsg, &vmr); 428 memcpy(&vmr, imsg->data, sizeof(vmr)); 429 DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d", 430 __func__, vmr.vmr_id, vmr.vmr_result); 431 if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) { 432 log_debug("%s: vm %d is no longer available", 433 __func__, vmr.vmr_id); 434 break; 435 } 436 if (vmr.vmr_result != EAGAIN) { 437 if (vm->vm_from_config) 438 vm_stop(vm, 0, __func__); 439 else 440 vm_remove(vm, __func__); 441 } else { 442 /* Stop VM instance but keep the tty open */ 443 vm_stop(vm, 1, __func__); 444 config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid); 445 } 446 447 /* Send a response if a control client is waiting for it */ 448 if (imsg->hdr.peerid != (uint32_t)-1) { 449 /* the error is meaningless for deferred responses */ 450 vmr.vmr_result = 0; 451 452 if (proc_compose_imsg(ps, PROC_CONTROL, -1, 453 IMSG_VMDOP_TERMINATE_VM_RESPONSE, 454 imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1) 455 return (-1); 456 } 457 break; 458 case IMSG_VMDOP_GET_INFO_VM_DATA: 459 IMSG_SIZE_CHECK(imsg, &vir); 460 memcpy(&vir, imsg->data, sizeof(vir)); 461 if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) { 462 memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname)); 463 if (vm->vm_ttyname != NULL) 464 strlcpy(vir.vir_ttyname, vm->vm_ttyname, 465 sizeof(vir.vir_ttyname)); 466 if (vm->vm_shutdown) { 467 /* XXX there might be a nicer way */ 468 (void)strlcat(vir.vir_info.vir_name, 469 " - stopping", 470 sizeof(vir.vir_info.vir_name)); 471 } 472 /* get the user id who started the vm */ 473 vir.vir_uid = 
vm->vm_uid; 474 vir.vir_gid = vm->vm_params.vmc_owner.gid; 475 } 476 if (proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type, 477 imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) { 478 log_debug("%s: GET_INFO_VM failed for vm %d, removing", 479 __func__, vm->vm_vmid); 480 vm_remove(vm, __func__); 481 return (-1); 482 } 483 break; 484 case IMSG_VMDOP_GET_INFO_VM_END_DATA: 485 /* 486 * PROC_VMM has responded with the *running* VMs, now we 487 * append the others. These use the special value 0 for their 488 * kernel id to indicate that they are not running. 489 */ 490 TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { 491 if (!vm->vm_running) { 492 memset(&vir, 0, sizeof(vir)); 493 vir.vir_info.vir_id = vm->vm_vmid; 494 strlcpy(vir.vir_info.vir_name, 495 vm->vm_params.vmc_params.vcp_name, 496 VMM_MAX_NAME_LEN); 497 vir.vir_info.vir_memory_size = 498 vm->vm_params.vmc_params. 499 vcp_memranges[0].vmr_size; 500 vir.vir_info.vir_ncpus = 501 vm->vm_params.vmc_params.vcp_ncpus; 502 /* get the configured user id for this vm */ 503 vir.vir_uid = vm->vm_params.vmc_owner.uid; 504 vir.vir_gid = vm->vm_params.vmc_owner.gid; 505 if (proc_compose_imsg(ps, PROC_CONTROL, -1, 506 IMSG_VMDOP_GET_INFO_VM_DATA, 507 imsg->hdr.peerid, -1, &vir, 508 sizeof(vir)) == -1) { 509 log_debug("%s: GET_INFO_VM_END failed", 510 __func__); 511 vm_remove(vm, __func__); 512 return (-1); 513 } 514 } 515 } 516 IMSG_SIZE_CHECK(imsg, &res); 517 proc_forward_imsg(ps, imsg, PROC_CONTROL, -1); 518 break; 519 default: 520 return (-1); 521 } 522 523 return (0); 524} 525 526int 527vmd_check_vmh(struct vm_dump_header *vmh) 528{ 529 int i; 530 unsigned int code, leaf; 531 unsigned int a, b, c, d; 532 533 534 if (vmh->vmh_version != VM_DUMP_VERSION) { 535 log_warnx("%s: incompatible dump version", __func__); 536 return (-1); 537 } 538 539 for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) { 540 code = vmh->vmh_cpuids[i].code; 541 leaf = vmh->vmh_cpuids[i].leaf; 542 if (leaf != 0x00) { 543 log_debug("%s: invalid leaf 0x%x 
for code 0x%x", 544 __func__, leaf, code); 545 return (-1); 546 } 547 548 switch (code) { 549 case 0x00: 550 CPUID_LEAF(code, leaf, a, b, c, d); 551 if (vmh->vmh_cpuids[i].a > a) { 552 log_debug("%s: incompatible cpuid level", 553 __func__); 554 return (-1); 555 } 556 if (!(vmh->vmh_cpuids[i].b == b && 557 vmh->vmh_cpuids[i].c == c && 558 vmh->vmh_cpuids[i].d == d)) { 559 log_debug("%s: incompatible cpu brand", 560 __func__); 561 return (-1); 562 } 563 break; 564 565 case 0x01: 566 CPUID_LEAF(code, leaf, a, b, c, d); 567 if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) != 568 (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) { 569 log_debug("%s: incompatible cpu features " 570 "code: 0x%x leaf: 0x%x reg: c", __func__, 571 code, leaf); 572 return (-1); 573 } 574 if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) != 575 (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) { 576 log_debug("%s: incompatible cpu features " 577 "code: 0x%x leaf: 0x%x reg: d", __func__, 578 code, leaf); 579 return (-1); 580 } 581 break; 582 583 case 0x07: 584 CPUID_LEAF(code, leaf, a, b, c, d); 585 if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) != 586 (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) { 587 log_debug("%s: incompatible cpu features " 588 "code: 0x%x leaf: 0x%x reg: c", __func__, 589 code, leaf); 590 return (-1); 591 } 592 if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) != 593 (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) { 594 log_debug("%s: incompatible cpu features " 595 "code: 0x%x leaf: 0x%x reg: d", __func__, 596 code, leaf); 597 return (-1); 598 } 599 break; 600 601 case 0x0d: 602 CPUID_LEAF(code, leaf, a, b, c, d); 603 if (vmh->vmh_cpuids[i].b > b) { 604 log_debug("%s: incompatible cpu: insufficient " 605 "max save area for enabled XCR0 features", 606 __func__); 607 return (-1); 608 } 609 if (vmh->vmh_cpuids[i].c > c) { 610 log_debug("%s: incompatible cpu: insufficient " 611 "max save area for supported XCR0 features", 612 __func__); 613 return (-1); 614 } 615 break; 616 617 case 
0x80000001: 618 CPUID_LEAF(code, leaf, a, b, c, d); 619 if ((vmh->vmh_cpuids[i].a & a) != 620 vmh->vmh_cpuids[i].a) { 621 log_debug("%s: incompatible cpu features " 622 "code: 0x%x leaf: 0x%x reg: a", __func__, 623 code, leaf); 624 return (-1); 625 } 626 if ((vmh->vmh_cpuids[i].c & c) != 627 vmh->vmh_cpuids[i].c) { 628 log_debug("%s: incompatible cpu features " 629 "code: 0x%x leaf: 0x%x reg: c", __func__, 630 code, leaf); 631 return (-1); 632 } 633 if ((vmh->vmh_cpuids[i].d & d) != 634 vmh->vmh_cpuids[i].d) { 635 log_debug("%s: incompatible cpu features " 636 "code: 0x%x leaf: 0x%x reg: d", __func__, 637 code, leaf); 638 return (-1); 639 } 640 break; 641 642 default: 643 log_debug("%s: unknown code 0x%x", __func__, code); 644 return (-1); 645 } 646 } 647 648 return (0); 649} 650 651void 652vmd_sighdlr(int sig, short event, void *arg) 653{ 654 if (privsep_process != PROC_PARENT) 655 return; 656 log_debug("%s: handling signal", __func__); 657 658 switch (sig) { 659 case SIGHUP: 660 log_info("%s: reload requested with SIGHUP", __func__); 661 662 /* 663 * This is safe because libevent uses async signal handlers 664 * that run in the event loop and not in signal context. 
/* Print a usage message and exit non-zero. */
__dead void
usage(void)
{
	extern char *__progname;
	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
	    __progname);
	exit(1);
}

/*
 * vmd entry point: parse command-line options, open /dev/vmm, fork the
 * privsep children (priv/control/vmm), install signal handlers and run
 * the libevent loop.  Only the parent process returns from proc_init().
 */
int
main(int argc, char **argv)
{
	struct privsep		*ps;
	int			 ch;
	const char		*conffile = VMD_CONF;
	enum privsep_procid	 proc_id = PROC_PARENT;
	int			 proc_instance = 0;
	const char		*errp, *title = NULL;
	int			 argc0 = argc;	/* proc_init needs the original argc */

	log_init(0, LOG_DAEMON);

	if ((env = calloc(1, sizeof(*env))) == NULL)
		fatal("calloc: env");

	while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) {
		switch (ch) {
		case 'D':
			/* define a macro for the config file parser */
			if (cmdline_symset(optarg) < 0)
				log_warnx("could not parse macro definition %s",
				    optarg);
			break;
		case 'd':
			env->vmd_debug = 2;
			break;
		case 'f':
			conffile = optarg;
			break;
		case 'v':
			env->vmd_verbose++;
			break;
		case 'n':
			/* configtest mode: parse the config and exit */
			env->vmd_noaction = 1;
			break;
		case 'P':
			/* re-exec'd child: run as the named privsep process */
			title = optarg;
			proc_id = proc_getid(procs, nitems(procs), title);
			if (proc_id == PROC_MAX)
				fatalx("invalid process name");
			break;
		case 'I':
			proc_instance = strtonum(optarg, 0,
			    PROC_MAX_INSTANCES, &errp);
			if (errp)
				fatalx("invalid process instance");
			break;
		default:
			usage();
		}
	}

	argc -= optind;
	if (argc > 0)
		usage();

	if (env->vmd_noaction && !env->vmd_debug)
		env->vmd_debug = 1;

	/* check for root privileges */
	if (env->vmd_noaction == 0) {
		if (geteuid())
			fatalx("need root privileges");
	}

	ps = &env->vmd_ps;
	ps->ps_env = env;
	env->vmd_fd = -1;

	if (config_init(env) == -1)
		fatal("failed to initialize configuration");

	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
		fatal("unknown user %s", VMD_USER);

	/* First proc runs as root without pledge but in default chroot */
	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */

	/* Open /dev/vmm */
	if (env->vmd_noaction == 0) {
		env->vmd_fd = open(VMM_NODE, O_RDWR);
		if (env->vmd_fd == -1)
			fatal("%s", VMM_NODE);
	}

	/* Configure the control socket */
	ps->ps_csock.cs_name = SOCKET_NAME;
	TAILQ_INIT(&ps->ps_rcsocks);

	/* Configuration will be parsed after forking the children */
	env->vmd_conffile = conffile;

	log_init(env->vmd_debug, LOG_DAEMON);
	log_setverbose(env->vmd_verbose);

	if (env->vmd_noaction)
		ps->ps_noaction = 1;
	ps->ps_instance = proc_instance;
	if (title != NULL)
		ps->ps_title[proc_id] = title;

	/* only the parent returns */
	proc_init(ps, procs, nitems(procs), argc0, argv, proc_id);

	log_procinit("parent");
	if (!env->vmd_debug && daemon(0, 0) == -1)
		fatal("can't daemonize");

	if (ps->ps_noaction == 0)
		log_info("startup");

	event_init();

	/* route all lifecycle signals through the libevent loop */
	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);

	signal_add(&ps->ps_evsigint, NULL);
	signal_add(&ps->ps_evsigterm, NULL);
	signal_add(&ps->ps_evsighup, NULL);
	signal_add(&ps->ps_evsigpipe, NULL);
	signal_add(&ps->ps_evsigusr1, NULL);

	if (!env->vmd_noaction)
		proc_connect(ps);

	if (vmd_configure() == -1)
		fatalx("configuration failed");

	event_dispatch();

	log_debug("parent exiting");

	return (0);
}
832vmd_configure(void) 833{ 834 struct vmd_vm *vm; 835 struct vmd_switch *vsw; 836 837 if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1) 838 fatal("open %s", PATH_PTMDEV); 839 840 /* 841 * pledge in the parent process: 842 * stdio - for malloc and basic I/O including events. 843 * rpath - for reload to open and read the configuration files. 844 * wpath - for opening disk images and tap devices. 845 * tty - for openpty and TIOCUCNTL. 846 * proc - run kill to terminate its children safely. 847 * sendfd - for disks, interfaces and other fds. 848 * recvfd - for send and receive. 849 * getpw - lookup user or group id by name. 850 * chown, fattr - change tty ownership 851 * flock - locking disk files 852 */ 853 if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw" 854 " chown fattr flock", NULL) == -1) 855 fatal("pledge"); 856 857 if (parse_config(env->vmd_conffile) == -1) { 858 proc_kill(&env->vmd_ps); 859 exit(1); 860 } 861 862 if (env->vmd_noaction) { 863 fprintf(stderr, "configuration OK\n"); 864 proc_kill(&env->vmd_ps); 865 exit(0); 866 } 867 868 /* Send shared global configuration to all children */ 869 if (config_setconfig(env) == -1) 870 return (-1); 871 872 TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) { 873 if (vsw->sw_running) 874 continue; 875 if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) { 876 log_warn("%s: failed to create switch %s", 877 __func__, vsw->sw_name); 878 switch_remove(vsw); 879 return (-1); 880 } 881 } 882 883 TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { 884 if (vm->vm_disabled) { 885 log_debug("%s: not creating vm %s (disabled)", 886 __func__, 887 vm->vm_params.vmc_params.vcp_name); 888 continue; 889 } 890 if (config_setvm(&env->vmd_ps, vm, 891 -1, vm->vm_params.vmc_owner.uid) == -1) 892 return (-1); 893 } 894 895 return (0); 896} 897 898int 899vmd_reload(unsigned int reset, const char *filename) 900{ 901 struct vmd_vm *vm, *next_vm; 902 struct vmd_switch *vsw; 903 int reload = 0; 904 905 /* Switch back to the default 
config file */ 906 if (filename == NULL || *filename == '\0') { 907 filename = env->vmd_conffile; 908 reload = 1; 909 } 910 911 log_debug("%s: level %d config file %s", __func__, reset, filename); 912 913 if (reset) { 914 /* Purge the configuration */ 915 config_purge(env, reset); 916 config_setreset(env, reset); 917 } else { 918 /* 919 * Load or reload the configuration. 920 * 921 * Reloading removes all non-running VMs before processing the 922 * config file, whereas loading only adds to the existing list 923 * of VMs. 924 */ 925 926 if (reload) { 927 TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, 928 next_vm) { 929 if (vm->vm_running == 0) { 930 DPRINTF("%s: calling vm_remove", 931 __func__); 932 vm_remove(vm, __func__); 933 } 934 } 935 } 936 937 if (parse_config(filename) == -1) { 938 log_debug("%s: failed to load config file %s", 939 __func__, filename); 940 return (-1); 941 } 942 943 if (reload) { 944 /* Update shared global configuration in all children */ 945 if (config_setconfig(env) == -1) 946 return (-1); 947 } 948 949 TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) { 950 if (vsw->sw_running) 951 continue; 952 if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) { 953 log_warn("%s: failed to create switch %s", 954 __func__, vsw->sw_name); 955 switch_remove(vsw); 956 return (-1); 957 } 958 } 959 960 TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { 961 if (vm->vm_running == 0) { 962 if (vm->vm_disabled) { 963 log_debug("%s: not creating vm %s" 964 " (disabled)", __func__, 965 vm->vm_params.vmc_params.vcp_name); 966 continue; 967 } 968 if (config_setvm(&env->vmd_ps, vm, 969 -1, vm->vm_params.vmc_owner.uid) == -1) 970 return (-1); 971 } else { 972 log_debug("%s: not creating vm \"%s\": " 973 "(running)", __func__, 974 vm->vm_params.vmc_params.vcp_name); 975 } 976 } 977 } 978 979 return (0); 980} 981 982void 983vmd_shutdown(void) 984{ 985 struct vmd_vm *vm, *vm_next; 986 987 log_debug("%s: performing shutdown", __func__); 988 989 TAILQ_FOREACH_SAFE(vm, env->vmd_vms, 
vm_entry, vm_next) { 990 vm_remove(vm, __func__); 991 } 992 993 proc_kill(&env->vmd_ps); 994 free(env); 995 996 log_warnx("parent terminating"); 997 exit(0); 998} 999 1000struct vmd_vm * 1001vm_getbyvmid(uint32_t vmid) 1002{ 1003 struct vmd_vm *vm; 1004 1005 if (vmid == 0) 1006 return (NULL); 1007 TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { 1008 if (vm->vm_vmid == vmid) 1009 return (vm); 1010 } 1011 1012 return (NULL); 1013} 1014 1015struct vmd_vm * 1016vm_getbyid(uint32_t id) 1017{ 1018 struct vmd_vm *vm; 1019 1020 if (id == 0) 1021 return (NULL); 1022 TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { 1023 if (vm->vm_params.vmc_params.vcp_id == id) 1024 return (vm); 1025 } 1026 1027 return (NULL); 1028} 1029 1030uint32_t 1031vm_id2vmid(uint32_t id, struct vmd_vm *vm) 1032{ 1033 if (vm == NULL && (vm = vm_getbyid(id)) == NULL) 1034 return (0); 1035 DPRINTF("%s: vmm id %u is vmid %u", __func__, 1036 id, vm->vm_vmid); 1037 return (vm->vm_vmid); 1038} 1039 1040uint32_t 1041vm_vmid2id(uint32_t vmid, struct vmd_vm *vm) 1042{ 1043 if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL) 1044 return (0); 1045 DPRINTF("%s: vmid %u is vmm id %u", __func__, 1046 vmid, vm->vm_params.vmc_params.vcp_id); 1047 return (vm->vm_params.vmc_params.vcp_id); 1048} 1049 1050struct vmd_vm * 1051vm_getbyname(const char *name) 1052{ 1053 struct vmd_vm *vm; 1054 1055 if (name == NULL) 1056 return (NULL); 1057 TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { 1058 if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0) 1059 return (vm); 1060 } 1061 1062 return (NULL); 1063} 1064 1065struct vmd_vm * 1066vm_getbypid(pid_t pid) 1067{ 1068 struct vmd_vm *vm; 1069 1070 TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { 1071 if (vm->vm_pid == pid) 1072 return (vm); 1073 } 1074 1075 return (NULL); 1076} 1077 1078void 1079vm_stop(struct vmd_vm *vm, int keeptty, const char *caller) 1080{ 1081 struct privsep *ps = &env->vmd_ps; 1082 unsigned int i; 1083 1084 if (vm == NULL) 1085 return; 1086 1087 log_debug("%s: %s %s 
stopping vm %d%s", 1088 __func__, ps->ps_title[privsep_process], caller, 1089 vm->vm_vmid, keeptty ? ", keeping tty open" : ""); 1090 1091 vm->vm_running = 0; 1092 vm->vm_shutdown = 0; 1093 1094 user_inc(&vm->vm_params.vmc_params, vm->vm_user, 0); 1095 user_put(vm->vm_user); 1096 1097 if (vm->vm_iev.ibuf.fd != -1) { 1098 event_del(&vm->vm_iev.ev); 1099 close(vm->vm_iev.ibuf.fd); 1100 } 1101 for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) { 1102 if (vm->vm_disks[i] != -1) { 1103 close(vm->vm_disks[i]); 1104 vm->vm_disks[i] = -1; 1105 } 1106 } 1107 for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) { 1108 if (vm->vm_ifs[i].vif_fd != -1) { 1109 close(vm->vm_ifs[i].vif_fd); 1110 vm->vm_ifs[i].vif_fd = -1; 1111 } 1112 free(vm->vm_ifs[i].vif_name); 1113 free(vm->vm_ifs[i].vif_switch); 1114 free(vm->vm_ifs[i].vif_group); 1115 vm->vm_ifs[i].vif_name = NULL; 1116 vm->vm_ifs[i].vif_switch = NULL; 1117 vm->vm_ifs[i].vif_group = NULL; 1118 } 1119 if (vm->vm_kernel != -1) { 1120 close(vm->vm_kernel); 1121 vm->vm_kernel = -1; 1122 } 1123 if (vm->vm_cdrom != -1) { 1124 close(vm->vm_cdrom); 1125 vm->vm_cdrom = -1; 1126 } 1127 if (!keeptty) { 1128 vm_closetty(vm); 1129 vm->vm_uid = 0; 1130 } 1131} 1132 1133void 1134vm_remove(struct vmd_vm *vm, const char *caller) 1135{ 1136 struct privsep *ps = &env->vmd_ps; 1137 1138 if (vm == NULL) 1139 return; 1140 1141 log_debug("%s: %s %s removing vm %d from running config", 1142 __func__, ps->ps_title[privsep_process], caller, 1143 vm->vm_vmid); 1144 1145 TAILQ_REMOVE(env->vmd_vms, vm, vm_entry); 1146 1147 user_put(vm->vm_user); 1148 vm_stop(vm, 0, caller); 1149 free(vm); 1150} 1151 1152int 1153vm_register(struct privsep *ps, struct vmop_create_params *vmc, 1154 struct vmd_vm **ret_vm, uint32_t id, uid_t uid) 1155{ 1156 struct vmd_vm *vm = NULL, *vm_parent = NULL; 1157 struct vm_create_params *vcp = &vmc->vmc_params; 1158 struct vmop_owner *vmo = NULL; 1159 struct vmd_user *usr = NULL; 1160 uint32_t rng; 1161 unsigned int i; 1162 struct vmd_switch *sw; 
1163 char *s; 1164 1165 /* Check if this is an instance of another VM */ 1166 if (vm_instance(ps, &vm_parent, vmc, uid) == -1) 1167 return (-1); 1168 1169 errno = 0; 1170 *ret_vm = NULL; 1171 1172 if ((vm = vm_getbyname(vcp->vcp_name)) != NULL || 1173 (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) { 1174 if (vm_checkperm(vm, &vm->vm_params.vmc_owner, 1175 uid) != 0) { 1176 errno = EPERM; 1177 goto fail; 1178 } 1179 *ret_vm = vm; 1180 errno = EALREADY; 1181 goto fail; 1182 } 1183 1184 if (vm_parent != NULL) 1185 vmo = &vm_parent->vm_params.vmc_insowner; 1186 1187 /* non-root users can only start existing VMs or instances */ 1188 if (vm_checkperm(NULL, vmo, uid) != 0) { 1189 log_warnx("permission denied"); 1190 errno = EPERM; 1191 goto fail; 1192 } 1193 if (vmc->vmc_flags == 0) { 1194 log_warnx("invalid configuration, no devices"); 1195 errno = VMD_DISK_MISSING; 1196 goto fail; 1197 } 1198 if (vcp->vcp_ncpus == 0) 1199 vcp->vcp_ncpus = 1; 1200 if (vcp->vcp_memranges[0].vmr_size == 0) 1201 vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY; 1202 if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) { 1203 log_warnx("invalid number of CPUs"); 1204 goto fail; 1205 } else if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) { 1206 log_warnx("invalid number of disks"); 1207 goto fail; 1208 } else if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) { 1209 log_warnx("invalid number of interfaces"); 1210 goto fail; 1211 } else if (strlen(vcp->vcp_kernel) == 0 && 1212 vcp->vcp_ndisks == 0 && strlen(vcp->vcp_cdrom) == 0) { 1213 log_warnx("no kernel or disk/cdrom specified"); 1214 goto fail; 1215 } else if (strlen(vcp->vcp_name) == 0) { 1216 log_warnx("invalid VM name"); 1217 goto fail; 1218 } else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' || 1219 *vcp->vcp_name == '_') { 1220 log_warnx("invalid VM name"); 1221 goto fail; 1222 } else { 1223 for (s = vcp->vcp_name; *s != '\0'; ++s) { 1224 if (!(isalnum(*s) || *s == '.' 
|| *s == '-' || 1225 *s == '_')) { 1226 log_warnx("invalid VM name"); 1227 goto fail; 1228 } 1229 } 1230 } 1231 1232 /* track active users */ 1233 if (uid != 0 && env->vmd_users != NULL && 1234 (usr = user_get(uid)) == NULL) { 1235 log_warnx("could not add user"); 1236 goto fail; 1237 } 1238 1239 if ((vm = calloc(1, sizeof(*vm))) == NULL) 1240 goto fail; 1241 1242 memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params)); 1243 vmc = &vm->vm_params; 1244 vcp = &vmc->vmc_params; 1245 vm->vm_pid = -1; 1246 vm->vm_tty = -1; 1247 vm->vm_receive_fd = -1; 1248 vm->vm_paused = 0; 1249 vm->vm_user = usr; 1250 1251 for (i = 0; i < vcp->vcp_ndisks; i++) 1252 vm->vm_disks[i] = -1; 1253 for (i = 0; i < vcp->vcp_nnics; i++) { 1254 vm->vm_ifs[i].vif_fd = -1; 1255 1256 if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) { 1257 /* inherit per-interface flags from the switch */ 1258 vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK); 1259 } 1260 1261 /* 1262 * If the MAC address is zero, always randomize it in vmd(8) 1263 * because we cannot rely on the guest OS to do the right 1264 * thing like OpenBSD does. Based on ether_fakeaddr() 1265 * from the kernel, incremented by one to differentiate 1266 * the source. 1267 */ 1268 if (memcmp(zero_mac, &vcp->vcp_macs[i], ETHER_ADDR_LEN) == 0) { 1269 rng = arc4random(); 1270 vcp->vcp_macs[i][0] = 0xfe; 1271 vcp->vcp_macs[i][1] = 0xe1; 1272 vcp->vcp_macs[i][2] = 0xba + 1; 1273 vcp->vcp_macs[i][3] = 0xd0 | ((i + 1) & 0xf); 1274 vcp->vcp_macs[i][4] = rng; 1275 vcp->vcp_macs[i][5] = rng >> 8; 1276 } 1277 } 1278 vm->vm_kernel = -1; 1279 vm->vm_cdrom = -1; 1280 vm->vm_iev.ibuf.fd = -1; 1281 1282 if (++env->vmd_nvm == 0) 1283 fatalx("too many vms"); 1284 1285 /* Assign a new internal Id if not specified */ 1286 vm->vm_vmid = id == 0 ? 
env->vmd_nvm : id; 1287 1288 log_debug("%s: registering vm %d", __func__, vm->vm_vmid); 1289 TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry); 1290 1291 *ret_vm = vm; 1292 return (0); 1293 fail: 1294 if (errno == 0) 1295 errno = EINVAL; 1296 return (-1); 1297} 1298 1299int 1300vm_instance(struct privsep *ps, struct vmd_vm **vm_parent, 1301 struct vmop_create_params *vmc, uid_t uid) 1302{ 1303 char *name; 1304 struct vm_create_params *vcp = &vmc->vmc_params; 1305 struct vmop_create_params *vmcp; 1306 struct vm_create_params *vcpp; 1307 struct vmd_vm *vm = NULL; 1308 unsigned int i, j; 1309 uint32_t id; 1310 1311 /* return without error if the parent is NULL (nothing to inherit) */ 1312 if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 || 1313 (*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) 1314 return (0); 1315 1316 errno = 0; 1317 vmcp = &(*vm_parent)->vm_params; 1318 vcpp = &vmcp->vmc_params; 1319 1320 /* Are we allowed to create an instance from this VM? */ 1321 if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) { 1322 log_warnx("vm \"%s\" no permission to create vm instance", 1323 vcpp->vcp_name); 1324 errno = ENAMETOOLONG; 1325 return (-1); 1326 } 1327 1328 id = vcp->vcp_id; 1329 name = vcp->vcp_name; 1330 1331 if ((vm = vm_getbyname(vcp->vcp_name)) != NULL || 1332 (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) { 1333 errno = EPROCLIM; 1334 return (-1); 1335 } 1336 1337 /* CPU */ 1338 if (vcp->vcp_ncpus == 0) 1339 vcp->vcp_ncpus = vcpp->vcp_ncpus; 1340 if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 && 1341 vcp->vcp_ncpus != vcpp->vcp_ncpus) { 1342 log_warnx("vm \"%s\" no permission to set cpus", name); 1343 errno = EPERM; 1344 return (-1); 1345 } 1346 1347 /* memory */ 1348 if (vcp->vcp_memranges[0].vmr_size == 0) 1349 vcp->vcp_memranges[0].vmr_size = 1350 vcpp->vcp_memranges[0].vmr_size; 1351 if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 && 1352 vcp->vcp_memranges[0].vmr_size != 1353 vcpp->vcp_memranges[0].vmr_size) { 1354 log_warnx("vm 
\"%s\" no permission to set memory", name); 1355 errno = EPERM; 1356 return (-1); 1357 } 1358 1359 /* disks cannot be inherited */ 1360 if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 && 1361 vcp->vcp_ndisks) { 1362 log_warnx("vm \"%s\" no permission to set disks", name); 1363 errno = EPERM; 1364 return (-1); 1365 } 1366 for (i = 0; i < vcp->vcp_ndisks; i++) { 1367 /* Check if this disk is already used in the parent */ 1368 for (j = 0; j < vcpp->vcp_ndisks; j++) { 1369 if (strcmp(vcp->vcp_disks[i], 1370 vcpp->vcp_disks[j]) == 0) { 1371 log_warnx("vm \"%s\" disk %s cannot be reused", 1372 name, vcp->vcp_disks[i]); 1373 errno = EBUSY; 1374 return (-1); 1375 } 1376 } 1377 vmc->vmc_checkaccess |= VMOP_CREATE_DISK; 1378 } 1379 1380 /* interfaces */ 1381 if (vcp->vcp_nnics > 0 && 1382 vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 && 1383 vcp->vcp_nnics != vcpp->vcp_nnics) { 1384 log_warnx("vm \"%s\" no permission to set interfaces", name); 1385 errno = EPERM; 1386 return (-1); 1387 } 1388 for (i = 0; i < vcpp->vcp_nnics; i++) { 1389 /* Interface got overwritten */ 1390 if (i < vcp->vcp_nnics) 1391 continue; 1392 1393 /* Copy interface from parent */ 1394 vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i]; 1395 (void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i], 1396 sizeof(vmc->vmc_ifnames[i])); 1397 (void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i], 1398 sizeof(vmc->vmc_ifswitch[i])); 1399 (void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i], 1400 sizeof(vmc->vmc_ifgroup[i])); 1401 memcpy(vcp->vcp_macs[i], vcpp->vcp_macs[i], 1402 sizeof(vcp->vcp_macs[i])); 1403 vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i]; 1404 vcp->vcp_nnics++; 1405 } 1406 for (i = 0; i < vcp->vcp_nnics; i++) { 1407 for (j = 0; j < vcpp->vcp_nnics; j++) { 1408 if (memcmp(zero_mac, vcp->vcp_macs[i], 1409 sizeof(vcp->vcp_macs[i])) != 0 && 1410 memcmp(vcpp->vcp_macs[i], vcp->vcp_macs[i], 1411 sizeof(vcp->vcp_macs[i])) != 0) { 1412 log_warnx("vm \"%s\" lladdr cannot be reused", 1413 
name); 1414 errno = EBUSY; 1415 return (-1); 1416 } 1417 if (strlen(vmc->vmc_ifnames[i]) && 1418 strcmp(vmc->vmc_ifnames[i], 1419 vmcp->vmc_ifnames[j]) == 0) { 1420 log_warnx("vm \"%s\" %s cannot be reused", 1421 vmc->vmc_ifnames[i], name); 1422 errno = EBUSY; 1423 return (-1); 1424 } 1425 } 1426 } 1427 1428 /* kernel */ 1429 if (strlen(vcp->vcp_kernel) > 0) { 1430 if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) { 1431 log_warnx("vm \"%s\" no permission to set boot image", 1432 name); 1433 errno = EPERM; 1434 return (-1); 1435 } 1436 vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL; 1437 } else if (strlcpy(vcp->vcp_kernel, vcpp->vcp_kernel, 1438 sizeof(vcp->vcp_kernel)) >= sizeof(vcp->vcp_kernel)) { 1439 log_warnx("vm \"%s\" kernel name too long", name); 1440 errno = EINVAL; 1441 return (-1); 1442 } 1443 1444 /* cdrom */ 1445 if (strlen(vcp->vcp_cdrom) > 0) { 1446 if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) { 1447 log_warnx("vm \"%s\" no permission to set cdrom", name); 1448 errno = EPERM; 1449 return (-1); 1450 } 1451 vmc->vmc_checkaccess |= VMOP_CREATE_CDROM; 1452 } else if (strlcpy(vcp->vcp_cdrom, vcpp->vcp_cdrom, 1453 sizeof(vcp->vcp_cdrom)) >= sizeof(vcp->vcp_cdrom)) { 1454 log_warnx("vm \"%s\" cdrom name too long", name); 1455 errno = EINVAL; 1456 return (-1); 1457 } 1458 1459 /* user */ 1460 if (vmc->vmc_owner.uid == 0) 1461 vmc->vmc_owner.uid = vmcp->vmc_owner.uid; 1462 else if (vmc->vmc_owner.uid != uid && 1463 vmc->vmc_owner.uid != vmcp->vmc_owner.uid) { 1464 log_warnx("vm \"%s\" user mismatch", name); 1465 errno = EPERM; 1466 return (-1); 1467 } 1468 1469 /* group */ 1470 if (vmc->vmc_owner.gid == 0) 1471 vmc->vmc_owner.gid = vmcp->vmc_owner.gid; 1472 else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) { 1473 log_warnx("vm \"%s\" group mismatch", name); 1474 errno = EPERM; 1475 return (-1); 1476 } 1477 1478 /* child instances */ 1479 if (vmc->vmc_insflags) { 1480 log_warnx("vm \"%s\" cannot change instance permissions", name); 1481 errno = 
EPERM; 1482 return (-1); 1483 } 1484 if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) { 1485 vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid; 1486 vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid; 1487 vmc->vmc_insflags = vmcp->vmc_insflags; 1488 } else { 1489 vmc->vmc_insowner.gid = 0; 1490 vmc->vmc_insowner.uid = 0; 1491 vmc->vmc_insflags = 0; 1492 } 1493 1494 /* finished, remove instance flags */ 1495 vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE; 1496 1497 return (0); 1498} 1499 1500/* 1501 * vm_checkperm 1502 * 1503 * Checks if the user represented by the 'uid' parameter is allowed to 1504 * manipulate the VM described by the 'vm' parameter (or connect to said VM's 1505 * console.) 1506 * 1507 * Parameters: 1508 * vm: the VM whose permission is to be checked 1509 * vmo: the required uid/gid to be checked 1510 * uid: the user ID of the user making the request 1511 * 1512 * Return values: 1513 * 0: the permission should be granted 1514 * -1: the permission check failed (also returned if vm == null) 1515 */ 1516int 1517vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid) 1518{ 1519 struct group *gr; 1520 struct passwd *pw; 1521 char **grmem; 1522 1523 /* root has no restrictions */ 1524 if (uid == 0) 1525 return (0); 1526 1527 if (vmo == NULL) 1528 return (-1); 1529 1530 /* check user */ 1531 if (vm == NULL) { 1532 if (vmo->uid == uid) 1533 return (0); 1534 } else { 1535 /* 1536 * check user of running vm (the owner of a running vm can 1537 * be different to (or more specific than) the configured owner. 
1538 */ 1539 if ((vm->vm_running && vm->vm_uid == uid) || 1540 (!vm->vm_running && vmo->uid == uid)) 1541 return (0); 1542 } 1543 1544 /* check groups */ 1545 if (vmo->gid != -1) { 1546 if ((pw = getpwuid(uid)) == NULL) 1547 return (-1); 1548 if (pw->pw_gid == vmo->gid) 1549 return (0); 1550 if ((gr = getgrgid(vmo->gid)) != NULL) { 1551 for (grmem = gr->gr_mem; *grmem; grmem++) 1552 if (strcmp(*grmem, pw->pw_name) == 0) 1553 return (0); 1554 } 1555 } 1556 1557 return (-1); 1558} 1559 1560/* 1561 * vm_checkinsflag 1562 * 1563 * Checks wheter the non-root user is allowed to set an instance option. 1564 * 1565 * Parameters: 1566 * vmc: the VM create parameters 1567 * flag: the flag to be checked 1568 * uid: the user ID of the user making the request 1569 * 1570 * Return values: 1571 * 0: the permission should be granted 1572 * -1: the permission check failed (also returned if vm == null) 1573 */ 1574int 1575vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid) 1576{ 1577 /* root has no restrictions */ 1578 if (uid == 0) 1579 return (0); 1580 1581 if ((vmc->vmc_insflags & flag) == 0) 1582 return (-1); 1583 1584 return (0); 1585} 1586 1587/* 1588 * vm_checkaccess 1589 * 1590 * Checks if the user represented by the 'uid' parameter is allowed to 1591 * access the file described by the 'path' parameter. 
1592 * 1593 * Parameters: 1594 * fd: the file descriptor of the opened file 1595 * uflag: check if the userid has access to the file 1596 * uid: the user ID of the user making the request 1597 * amode: the access flags of R_OK and W_OK 1598 * 1599 * Return values: 1600 * 0: the permission should be granted 1601 * -1: the permission check failed 1602 */ 1603int 1604vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode) 1605{ 1606 struct group *gr; 1607 struct passwd *pw; 1608 char **grmem; 1609 struct stat st; 1610 mode_t mode; 1611 1612 if (fd == -1) 1613 return (-1); 1614 1615 /* 1616 * File has to be accessible and a regular file 1617 */ 1618 if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode)) 1619 return (-1); 1620 1621 /* root has no restrictions */ 1622 if (uid == 0 || uflag == 0) 1623 return (0); 1624 1625 /* check other */ 1626 mode = amode & W_OK ? S_IWOTH : 0; 1627 mode |= amode & R_OK ? S_IROTH : 0; 1628 if ((st.st_mode & mode) == mode) 1629 return (0); 1630 1631 /* check user */ 1632 mode = amode & W_OK ? S_IWUSR : 0; 1633 mode |= amode & R_OK ? S_IRUSR : 0; 1634 if (uid == st.st_uid && (st.st_mode & mode) == mode) 1635 return (0); 1636 1637 /* check groups */ 1638 mode = amode & W_OK ? S_IWGRP : 0; 1639 mode |= amode & R_OK ? 
S_IRGRP : 0; 1640 if ((st.st_mode & mode) != mode) 1641 return (-1); 1642 if ((pw = getpwuid(uid)) == NULL) 1643 return (-1); 1644 if (pw->pw_gid == st.st_gid) 1645 return (0); 1646 if ((gr = getgrgid(st.st_gid)) != NULL) { 1647 for (grmem = gr->gr_mem; *grmem; grmem++) 1648 if (strcmp(*grmem, pw->pw_name) == 0) 1649 return (0); 1650 } 1651 1652 return (-1); 1653} 1654 1655int 1656vm_opentty(struct vmd_vm *vm) 1657{ 1658 struct ptmget ptm; 1659 struct stat st; 1660 struct group *gr; 1661 uid_t uid; 1662 gid_t gid; 1663 mode_t mode; 1664 int on; 1665 1666 /* 1667 * Open tty with pre-opened PTM fd 1668 */ 1669 if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1)) 1670 return (-1); 1671 1672 /* 1673 * We use user ioctl(2) mode to pass break commands. 1674 */ 1675 on = 1; 1676 if (ioctl(ptm.cfd, TIOCUCNTL, &on)) 1677 fatal("could not enable user ioctl mode"); 1678 1679 vm->vm_tty = ptm.cfd; 1680 close(ptm.sfd); 1681 if ((vm->vm_ttyname = strdup(ptm.sn)) == NULL) 1682 goto fail; 1683 1684 uid = vm->vm_uid; 1685 gid = vm->vm_params.vmc_owner.gid; 1686 1687 if (vm->vm_params.vmc_owner.gid != -1) { 1688 mode = 0660; 1689 } else if ((gr = getgrnam("tty")) != NULL) { 1690 gid = gr->gr_gid; 1691 mode = 0620; 1692 } else { 1693 mode = 0600; 1694 gid = 0; 1695 } 1696 1697 log_debug("%s: vm %s tty %s uid %d gid %d mode %o", 1698 __func__, vm->vm_params.vmc_params.vcp_name, 1699 vm->vm_ttyname, uid, gid, mode); 1700 1701 /* 1702 * Change ownership and mode of the tty as required. 
1703 * Loosely based on the implementation of sshpty.c 1704 */ 1705 if (stat(vm->vm_ttyname, &st) == -1) 1706 goto fail; 1707 1708 if (st.st_uid != uid || st.st_gid != gid) { 1709 if (chown(vm->vm_ttyname, uid, gid) == -1) { 1710 log_warn("chown %s %d %d failed, uid %d", 1711 vm->vm_ttyname, uid, gid, getuid()); 1712 1713 /* Ignore failure on read-only filesystems */ 1714 if (!((errno == EROFS) && 1715 (st.st_uid == uid || st.st_uid == 0))) 1716 goto fail; 1717 } 1718 } 1719 1720 if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) { 1721 if (chmod(vm->vm_ttyname, mode) == -1) { 1722 log_warn("chmod %s %o failed, uid %d", 1723 vm->vm_ttyname, mode, getuid()); 1724 1725 /* Ignore failure on read-only filesystems */ 1726 if (!((errno == EROFS) && 1727 (st.st_uid == uid || st.st_uid == 0))) 1728 goto fail; 1729 } 1730 } 1731 1732 return (0); 1733 fail: 1734 vm_closetty(vm); 1735 return (-1); 1736} 1737 1738void 1739vm_closetty(struct vmd_vm *vm) 1740{ 1741 if (vm->vm_tty != -1) { 1742 /* Release and close the tty */ 1743 if (fchown(vm->vm_tty, 0, 0) == -1) 1744 log_warn("chown %s 0 0 failed", vm->vm_ttyname); 1745 if (fchmod(vm->vm_tty, 0666) == -1) 1746 log_warn("chmod %s 0666 failed", vm->vm_ttyname); 1747 close(vm->vm_tty); 1748 vm->vm_tty = -1; 1749 } 1750 free(vm->vm_ttyname); 1751 vm->vm_ttyname = NULL; 1752} 1753 1754void 1755switch_remove(struct vmd_switch *vsw) 1756{ 1757 if (vsw == NULL) 1758 return; 1759 1760 TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry); 1761 1762 free(vsw->sw_group); 1763 free(vsw->sw_name); 1764 free(vsw); 1765} 1766 1767struct vmd_switch * 1768switch_getbyname(const char *name) 1769{ 1770 struct vmd_switch *vsw; 1771 1772 if (name == NULL) 1773 return (NULL); 1774 TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) { 1775 if (strcmp(vsw->sw_name, name) == 0) 1776 return (vsw); 1777 } 1778 1779 return (NULL); 1780} 1781 1782struct vmd_user * 1783user_get(uid_t uid) 1784{ 1785 struct vmd_user *usr; 1786 1787 if (uid == 0) 1788 return 
(NULL); 1789 1790 /* first try to find an existing user */ 1791 TAILQ_FOREACH(usr, env->vmd_users, usr_entry) { 1792 if (usr->usr_id.uid == uid) 1793 goto done; 1794 } 1795 1796 if ((usr = calloc(1, sizeof(*usr))) == NULL) { 1797 log_warn("could not allocate user"); 1798 return (NULL); 1799 } 1800 1801 usr->usr_id.uid = uid; 1802 usr->usr_id.gid = -1; 1803 TAILQ_INSERT_TAIL(env->vmd_users, usr, usr_entry); 1804 1805 done: 1806 DPRINTF("%s: uid %d #%d +", 1807 __func__, usr->usr_id.uid, usr->usr_refcnt + 1); 1808 usr->usr_refcnt++; 1809 1810 return (usr); 1811} 1812 1813void 1814user_put(struct vmd_user *usr) 1815{ 1816 if (usr == NULL) 1817 return; 1818 1819 DPRINTF("%s: uid %d #%d -", 1820 __func__, usr->usr_id.uid, usr->usr_refcnt - 1); 1821 1822 if (--usr->usr_refcnt > 0) 1823 return; 1824 1825 TAILQ_REMOVE(env->vmd_users, usr, usr_entry); 1826 free(usr); 1827} 1828 1829void 1830user_inc(struct vm_create_params *vcp, struct vmd_user *usr, int inc) 1831{ 1832 char mem[FMT_SCALED_STRSIZE]; 1833 1834 if (usr == NULL) 1835 return; 1836 1837 /* increment or decrement counters */ 1838 inc = inc ? 1 : -1; 1839 1840 usr->usr_maxcpu += vcp->vcp_ncpus * inc; 1841 usr->usr_maxmem += vcp->vcp_memranges[0].vmr_size * inc; 1842 usr->usr_maxifs += vcp->vcp_nnics * inc; 1843 1844 if (log_getverbose() > 1) { 1845 (void)fmt_scaled(usr->usr_maxmem * 1024 * 1024, mem); 1846 log_debug("%s: %c uid %d ref %d cpu %llu mem %s ifs %llu", 1847 __func__, inc == 1 ? 
'+' : '-', 1848 usr->usr_id.uid, usr->usr_refcnt, 1849 usr->usr_maxcpu, mem, usr->usr_maxifs); 1850 } 1851} 1852 1853int 1854user_checklimit(struct vmd_user *usr, struct vm_create_params *vcp) 1855{ 1856 const char *limit = ""; 1857 1858 /* XXX make the limits configurable */ 1859 if (usr->usr_maxcpu > VM_DEFAULT_USER_MAXCPU) { 1860 limit = "cpu "; 1861 goto fail; 1862 } 1863 if (usr->usr_maxcpu > VM_DEFAULT_USER_MAXMEM) { 1864 limit = "memory "; 1865 goto fail; 1866 } 1867 if (usr->usr_maxifs > VM_DEFAULT_USER_MAXIFS) { 1868 limit = "interface "; 1869 goto fail; 1870 } 1871 1872 return (0); 1873 1874 fail: 1875 log_warnx("%s: user %d %slimit reached", vcp->vcp_name, 1876 usr->usr_id.uid, limit); 1877 return (-1); 1878} 1879 1880char * 1881get_string(uint8_t *ptr, size_t len) 1882{ 1883 size_t i; 1884 1885 for (i = 0; i < len; i++) 1886 if (!isprint(ptr[i])) 1887 break; 1888 1889 return strndup(ptr, i); 1890} 1891 1892uint32_t 1893prefixlen2mask(uint8_t prefixlen) 1894{ 1895 if (prefixlen == 0) 1896 return (0); 1897 1898 if (prefixlen > 32) 1899 prefixlen = 32; 1900 1901 return (htonl(0xffffffff << (32 - prefixlen))); 1902} 1903