/*	$OpenBSD: config.c,v 1.76 2024/09/26 01:45:13 jsg Exp $	*/

/*
 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/queue.h>
#include <sys/time.h>

#include <net/if.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <limits.h>
#include <string.h>
#include <fcntl.h>
#include <errno.h>
#include <imsg.h>

#include "proc.h"
#include "vmd.h"

/* Supported bridge types */
const char *vmd_descsw[] = { "bridge", "veb", NULL };

static int config_init_localprefix(struct vmd_config *);

static int
config_init_localprefix(struct vmd_config *cfg)
{
	if (parse_prefix4(VMD_DHCP_PREFIX, &cfg->cfg_localprefix, NULL) == -1)
		return (-1);

	/* IPv6 is disabled by default */
	cfg->cfg_flags &= ~VMD_CFG_INET6;

	/* Generate random IPv6 prefix only once */
	if (cfg->cfg_flags & VMD_CFG_AUTOINET6)
		return (0);
	if (parse_prefix6(VMD_ULA_PREFIX, &cfg->cfg_localprefix, NULL) == -1)
		return (-1);

	/* Randomize the 56 bits "Global ID" and "Subnet ID" */
	arc4random_buf(&cfg->cfg_localprefix.lp_in6.s6_addr[1], 7);
	cfg->cfg_flags |= VMD_CFG_AUTOINET6;

	return (0);
}
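
/*
 * Illustrative example (assumed values): VMD_ULA_PREFIX is expected to be
 * an RFC 4193 fd00::/64-style prefix (see vmd.h), so randomizing
 * s6_addr[1] through s6_addr[7] above fills in the 40-bit Global ID and
 * the 16-bit Subnet ID, yielding a local prefix such as
 * fd4a:13f2:8cc0:9e3b::/64. The VMD_CFG_AUTOINET6 flag ensures this is
 * done only once, so later calls from config_purge() reuse the same prefix.
 */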

int
config_init(struct vmd *env)
{
	struct privsep *ps = &env->vmd_ps;
	unsigned int what;

	/* Global configuration */
	ps->ps_what[PROC_PARENT] = CONFIG_ALL;
	ps->ps_what[PROC_VMM] = CONFIG_VMS;

	/* Local prefix */
	if (config_init_localprefix(&env->vmd_cfg) == -1)
		return (-1);

	/* Other configuration */
	what = ps->ps_what[privsep_process];
	if (what & CONFIG_VMS) {
		if ((env->vmd_vms = calloc(1, sizeof(*env->vmd_vms))) == NULL)
			return (-1);
		if ((env->vmd_known = calloc(1, sizeof(*env->vmd_known))) == NULL)
			return (-1);
		TAILQ_INIT(env->vmd_vms);
		TAILQ_INIT(env->vmd_known);
	}
	if (what & CONFIG_SWITCHES) {
		if ((env->vmd_switches = calloc(1,
		    sizeof(*env->vmd_switches))) == NULL)
			return (-1);
		TAILQ_INIT(env->vmd_switches);
	}

	return (0);
}

void
config_purge(struct vmd *env, unsigned int reset)
{
	struct privsep *ps = &env->vmd_ps;
	struct name2id *n2i;
	struct vmd_vm *vm;
	struct vmd_switch *vsw;
	unsigned int what;

	DPRINTF("%s: %s purging vms and switches",
	    __func__, ps->ps_title[privsep_process]);

	/* Reset global configuration (prefix was verified before) */
	config_init_localprefix(&env->vmd_cfg);

	/* Reset other configuration */
	what = ps->ps_what[privsep_process] & reset;
	if (what & CONFIG_VMS && env->vmd_vms != NULL) {
		while ((vm = TAILQ_FIRST(env->vmd_vms)) != NULL) {
			vm_remove(vm, __func__);
		}
		while ((n2i = TAILQ_FIRST(env->vmd_known)) != NULL) {
			TAILQ_REMOVE(env->vmd_known, n2i, entry);
			free(n2i);
		}
		env->vmd_nvm = 0;
	}
	if (what & CONFIG_SWITCHES && env->vmd_switches != NULL) {
		while ((vsw = TAILQ_FIRST(env->vmd_switches)) != NULL)
			switch_remove(vsw);
		env->vmd_nswitches = 0;
	}
}

int
config_setconfig(struct vmd *env)
{
	struct privsep *ps = &env->vmd_ps;
	unsigned int id;

	DPRINTF("%s: setting config", __func__);

	for (id = 0; id < PROC_MAX; id++) {
		if (id == privsep_process)
			continue;
		proc_compose(ps, id, IMSG_VMDOP_CONFIG, &env->vmd_cfg,
		    sizeof(env->vmd_cfg));
	}

	return (0);
}

int
config_getconfig(struct vmd *env, struct imsg *imsg)
{
	struct privsep *ps = &env->vmd_ps;

	log_debug("%s: %s retrieving config",
	    __func__, ps->ps_title[privsep_process]);

	IMSG_SIZE_CHECK(imsg, &env->vmd_cfg);
	memcpy(&env->vmd_cfg, imsg->data, sizeof(env->vmd_cfg));

	return (0);
}

int
config_setreset(struct vmd *env, unsigned int reset)
{
	struct privsep *ps = &env->vmd_ps;
	unsigned int id;

	DPRINTF("%s: resetting state", __func__);

	for (id = 0; id < PROC_MAX; id++) {
		if ((reset & ps->ps_what[id]) == 0 ||
		    id == privsep_process)
			continue;
		proc_compose(ps, id, IMSG_CTL_RESET, &reset, sizeof(reset));
	}

	return (0);
}

int
config_getreset(struct vmd *env, struct imsg *imsg)
{
	unsigned int mode;

	IMSG_SIZE_CHECK(imsg, &mode);
	memcpy(&mode, imsg->data, sizeof(mode));

	log_debug("%s: %s resetting state",
	    __func__, env->vmd_ps.ps_title[privsep_process]);

	config_purge(env, mode);

	return (0);
}

/*
 * config_setvm
 *
 * Configure a vm, opening any required file descriptors.
 *
 * Returns 0 on success, error code on failure.
 */
int
config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
{
	int diskfds[VM_MAX_DISKS_PER_VM][VM_MAX_BASE_PER_DISK];
	struct vmd_if *vif;
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params *vcp = &vmc->vmc_params;
	unsigned int i, j;
	int fd = -1, cdromfd = -1, kernfd = -1;
	int *tapfds = NULL;
	int n = 0, aflags, oflags, ret = -1;
	char ifname[IF_NAMESIZE], *s;
	char path[PATH_MAX], base[PATH_MAX];
	unsigned int unit;
	struct timeval tv, rate, since_last;
	struct vmop_addr_req var;
	size_t bytes = 0;

	if (vm->vm_state & VM_STATE_RUNNING) {
		log_warnx("%s: vm is already running", __func__);
		return (EALREADY);
	}

	/*
	 * Rate-limit the VM so that it cannot restart in a loop:
	 * if the VM restarts after less than VM_START_RATE_SEC seconds,
	 * we increment the limit counter. After VM_START_RATE_LIMIT
	 * of such fast reboots the VM is stopped.
	 */
	getmonotime(&tv);
	if (vm->vm_start_tv.tv_sec) {
		timersub(&tv, &vm->vm_start_tv, &since_last);

		rate.tv_sec = VM_START_RATE_SEC;
		rate.tv_usec = 0;
		if (timercmp(&since_last, &rate, <))
			vm->vm_start_limit++;
		else {
			/* Reset counter */
			vm->vm_start_limit = 0;
		}

		log_debug("%s: vm %u restarted after %lld.%ld seconds,"
		    " limit %d/%d", __func__, vcp->vcp_id, since_last.tv_sec,
		    since_last.tv_usec, vm->vm_start_limit,
		    VM_START_RATE_LIMIT);

		if (vm->vm_start_limit >= VM_START_RATE_LIMIT) {
			log_warnx("%s: vm %u restarted too quickly",
			    __func__, vcp->vcp_id);
			return (EPERM);
		}
	}
	vm->vm_start_tv = tv;
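
	/*
	 * Worked example (illustrative numbers; the real values are
	 * VM_START_RATE_SEC and VM_START_RATE_LIMIT from vmd.h): with a
	 * 60 second window and a limit of 3, a guest restarted 20 seconds
	 * after its last start bumps vm_start_limit to 1; three such quick
	 * restarts in a row trip the check above and the start is refused
	 * with EPERM, while a restart outside the window resets the
	 * counter to 0.
	 */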

	for (i = 0; i < VM_MAX_DISKS_PER_VM; i++)
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
			diskfds[i][j] = -1;

	tapfds = reallocarray(NULL, vmc->vmc_nnics, sizeof(*tapfds));
	if (tapfds == NULL) {
		ret = errno;
		log_warn("%s: can't allocate tap fds", __func__);
		return (ret);
	}
	for (i = 0; i < vmc->vmc_nnics; i++)
		tapfds[i] = -1;

	vm->vm_peerid = peerid;
	vm->vm_uid = uid;

	/*
	 * From here onward, all failures need cleanup and use goto fail
	 */
	if (!(vm->vm_state & VM_STATE_RECEIVED) && vm->vm_kernel == -1) {
		if (vm->vm_kernel_path != NULL) {
			/* Open external kernel for child */
			kernfd = open(vm->vm_kernel_path, O_RDONLY | O_CLOEXEC);
			if (kernfd == -1) {
				ret = errno;
				log_warn("%s: can't open kernel or BIOS "
				    "boot image %s", __func__,
				    vm->vm_kernel_path);
				goto fail;
			}
		}

		/*
		 * Try to open the default BIOS image if no kernel/BIOS has been
		 * specified. The BIOS is an external firmware file that is
		 * typically distributed separately due to an incompatible
		 * license.
		 */
		if (kernfd == -1) {
			if ((kernfd = open(VM_DEFAULT_BIOS,
			    O_RDONLY | O_CLOEXEC)) == -1) {
				log_warn("can't open %s", VM_DEFAULT_BIOS);
				ret = VMD_BIOS_MISSING;
				goto fail;
			}
		}

		if (vm_checkaccess(kernfd,
		    vmc->vmc_checkaccess & VMOP_CREATE_KERNEL,
		    uid, R_OK) == -1) {
			log_warnx("vm \"%s\" no read access to kernel "
			    "%s", vcp->vcp_name, vm->vm_kernel_path);
			ret = EPERM;
			goto fail;
		}

		vm->vm_kernel = kernfd;
		vmc->vmc_kernel = kernfd;
	}

	/* Open CDROM image for child */
	if (strlen(vmc->vmc_cdrom)) {
		/* Stat cdrom to ensure it is a regular file */
		if ((cdromfd =
		    open(vmc->vmc_cdrom, O_RDONLY)) == -1) {
			log_warn("can't open cdrom %s", vmc->vmc_cdrom);
			ret = VMD_CDROM_MISSING;
			goto fail;
		}

		if (vm_checkaccess(cdromfd,
		    vmc->vmc_checkaccess & VMOP_CREATE_CDROM,
		    uid, R_OK) == -1) {
			log_warnx("vm \"%s\" no read access to cdrom %s",
			    vcp->vcp_name, vmc->vmc_cdrom);
			ret = EPERM;
			goto fail;
		}
	}

	/*
	 * Open disk images for child. Don't set O_CLOEXEC as these must be
	 * explicitly closed by the vm process during virtio subprocess launch.
	 */
	for (i = 0 ; i < vmc->vmc_ndisks; i++) {
		if (strlcpy(path, vmc->vmc_disks[i], sizeof(path))
		    >= sizeof(path))
			log_warnx("disk path %s too long", vmc->vmc_disks[i]);
		memset(vmc->vmc_diskbases, 0, sizeof(vmc->vmc_diskbases));
		oflags = O_RDWR | O_EXLOCK | O_NONBLOCK;
		aflags = R_OK | W_OK;
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
			/* Stat disk[i] to ensure it is a regular file */
			if ((diskfds[i][j] = open(path, oflags)) == -1) {
				log_warn("can't open disk %s",
				    vmc->vmc_disks[i]);
				ret = VMD_DISK_MISSING;
				goto fail;
			}

			if (vm_checkaccess(diskfds[i][j],
			    vmc->vmc_checkaccess & VMOP_CREATE_DISK,
			    uid, aflags) == -1) {
				log_warnx("vm \"%s\" unable to access "
				    "disk %s", vcp->vcp_name, path);
				errno = EPERM;
				goto fail;
			}

			/*
			 * Clear the write and exclusive flags for base images.
			 * All writes should go to the top image, allowing them
			 * to be shared.
			 */
			oflags = O_RDONLY | O_NONBLOCK;
			aflags = R_OK;
			n = virtio_get_base(diskfds[i][j], base, sizeof(base),
			    vmc->vmc_disktypes[i], path);
			if (n == 0)
				break;
			if (n == -1) {
				log_warnx("vm \"%s\" unable to read "
				    "base for disk %s", vcp->vcp_name,
				    vmc->vmc_disks[i]);
				goto fail;
			}
			(void)strlcpy(path, base, sizeof(path));
		}
	}
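
	/*
	 * Example of the base-image walk above (hypothetical file names):
	 * for a qcow2 disk /var/vm/www.qcow2 derived from
	 * /var/vm/base.qcow2, the first pass opens the top image
	 * read-write with O_EXLOCK, virtio_get_base() then reports the
	 * base path, and the next pass reopens that base read-only so it
	 * can be shared; the walk ends when an image has no base or after
	 * VM_MAX_BASE_PER_DISK images. Similarly, in the interface loop
	 * below, an explicit name such as "tap3" is validated by
	 * priv_getiftype() and opened via /dev/tap3, while an empty name
	 * lets opentap() pick the next free tap(4) unit.
	 */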

	/* Open network interfaces */
	for (i = 0 ; i < vmc->vmc_nnics; i++) {
		vif = &vm->vm_ifs[i];

		/* Check if the user has requested a specific tap(4) */
		s = vmc->vmc_ifnames[i];
		if (*s != '\0' && strcmp("tap", s) != 0) {
			if (priv_getiftype(s, ifname, &unit) == -1 ||
			    strcmp(ifname, "tap") != 0) {
				log_warnx("%s: invalid tap name %s",
				    __func__, s);
				ret = EINVAL;
				goto fail;
			}
		} else
			s = NULL;

		/*
		 * Either open the requested tap(4) device or get
		 * the next available one. Don't set O_CLOEXEC as these
		 * should be closed by the vm process during virtio device
		 * launch.
		 */
		if (s != NULL) {
			snprintf(path, PATH_MAX, "/dev/%s", s);
			tapfds[i] = open(path, O_RDWR | O_NONBLOCK);
		} else {
			tapfds[i] = opentap(ifname);
			s = ifname;
		}
		if (tapfds[i] == -1) {
			ret = errno;
			log_warnx("%s: can't open /dev/%s", __func__, s);
			goto fail;
		}
		if ((vif->vif_name = strdup(s)) == NULL) {
			log_warn("%s: can't save tap %s", __func__, s);
			goto fail;
		}

		/* Check if the interface is attached to a switch */
		s = vmc->vmc_ifswitch[i];
		if (*s != '\0') {
			if ((vif->vif_switch = strdup(s)) == NULL) {
				log_warn("%s: can't save switch %s",
				    __func__, s);
				goto fail;
			}
		}

		/* Check if the interface is assigned to a group */
		s = vmc->vmc_ifgroup[i];
		if (*s != '\0') {
			if ((vif->vif_group = strdup(s)) == NULL) {
				log_warn("%s: can't save group %s",
				    __func__, s);
				goto fail;
			}
		}

		/* non-default rdomain (requires VMIFF_RDOMAIN below) */
		vif->vif_rdomain = vmc->vmc_ifrdomain[i];

		/* Set the interface status */
		vif->vif_flags =
		    vmc->vmc_ifflags[i] & (VMIFF_UP|VMIFF_OPTMASK);
	}

	/*
	 * Open TTY. Duplicate the fd before sending so the privileged parent
	 * process can perform permissions cleanup of the pty on vm termination.
	 */
	if (vm->vm_ttyname[0] == '\0') {
		if (vm_opentty(vm) == -1) {
			log_warn("%s: can't open tty %s", __func__,
			    vm->vm_ttyname[0] == '\0' ? "" : vm->vm_ttyname);
			goto fail;
		}
	}
	if ((fd = dup(vm->vm_tty)) == -1) {
		log_warn("%s: can't re-open tty %s", __func__, vm->vm_ttyname);
		goto fail;
	}

	/* Send VM information */
	/* XXX check proc_compose_imsg return values */
	if (vm->vm_state & VM_STATE_RECEIVED)
		proc_compose_imsg(ps, PROC_VMM, -1,
		    IMSG_VMDOP_RECEIVE_VM_REQUEST, vm->vm_vmid, fd, vmc,
		    sizeof(struct vmop_create_params));
	else
		proc_compose_imsg(ps, PROC_VMM, -1,
		    IMSG_VMDOP_START_VM_REQUEST, vm->vm_vmid, vm->vm_kernel,
		    vmc, sizeof(*vmc));

	if (strlen(vmc->vmc_cdrom))
		proc_compose_imsg(ps, PROC_VMM, -1,
		    IMSG_VMDOP_START_VM_CDROM, vm->vm_vmid, cdromfd,
		    NULL, 0);

	for (i = 0; i < vmc->vmc_ndisks; i++) {
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
			if (diskfds[i][j] == -1)
				break;
			proc_compose_imsg(ps, PROC_VMM, -1,
			    IMSG_VMDOP_START_VM_DISK, vm->vm_vmid,
			    diskfds[i][j], &i, sizeof(i));
		}
	}
	for (i = 0; i < vmc->vmc_nnics; i++) {
		proc_compose_imsg(ps, PROC_VMM, -1,
		    IMSG_VMDOP_START_VM_IF, vm->vm_vmid, tapfds[i],
		    &i, sizeof(i));

		memset(&var, 0, sizeof(var));
		var.var_vmid = vm->vm_vmid;
		var.var_nic_idx = i;
		proc_compose_imsg(ps, PROC_PRIV, -1, IMSG_VMDOP_PRIV_GET_ADDR,
		    vm->vm_vmid, dup(tapfds[i]), &var, sizeof(var));
	}

	if (!(vm->vm_state & VM_STATE_RECEIVED))
		proc_compose_imsg(ps, PROC_VMM, -1,
		    IMSG_VMDOP_START_VM_END, vm->vm_vmid, fd, NULL, 0);

	free(tapfds);

	/* Collapse any memranges after the vm was sent to PROC_VMM */
	if (vcp->vcp_nmemranges > 0) {
		for (i = 0; i < vcp->vcp_nmemranges; i++)
			bytes += vcp->vcp_memranges[i].vmr_size;
		memset(&vcp->vcp_memranges, 0, sizeof(vcp->vcp_memranges));
		vcp->vcp_nmemranges = 0;
		vcp->vcp_memranges[0].vmr_size = bytes;
	}
	vm->vm_state |= VM_STATE_RUNNING;
	return (0);

 fail:
	log_warnx("failed to start vm %s", vcp->vcp_name);

	if (vm->vm_kernel != -1)
		close(kernfd);
	if (cdromfd != -1)
		close(cdromfd);
	for (i = 0; i < vmc->vmc_ndisks; i++)
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
			if (diskfds[i][j] != -1)
				close(diskfds[i][j]);
	if (tapfds != NULL) {
		for (i = 0; i < vmc->vmc_nnics; i++)
			close(tapfds[i]);
		free(tapfds);
	}

	if (vm->vm_from_config) {
		vm_stop(vm, 0, __func__);
	} else {
		vm_remove(vm, __func__);
	}

	return (ret);
}

int
config_getvm(struct privsep *ps, struct imsg *imsg)
{
	struct vmop_create_params vmc;
	struct vmd_vm *vm = NULL;
	int fd;

	IMSG_SIZE_CHECK(imsg, &vmc);
	memcpy(&vmc, imsg->data, sizeof(vmc));
	fd = imsg_get_fd(imsg);
	vmc.vmc_kernel = fd;

	errno = 0;
	if (vm_register(ps, &vmc, &vm, imsg->hdr.peerid, 0) == -1)
		goto fail;

	vm->vm_state |= VM_STATE_RUNNING;
	vm->vm_peerid = (uint32_t)-1;
	vm->vm_kernel = fd;
	return (0);

 fail:
	if (fd != -1)
		close(fd);

	vm_remove(vm, __func__);
	if (errno == 0)
		errno = EINVAL;

	return (-1);
}

int
config_getdisk(struct privsep *ps, struct imsg *imsg)
{
	struct vmd_vm *vm;
	unsigned int n, idx;
	int fd;

	errno = 0;
	if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
		errno = ENOENT;
		return (-1);
	}

	IMSG_SIZE_CHECK(imsg, &n);
	memcpy(&n, imsg->data, sizeof(n));
	fd = imsg_get_fd(imsg);

	if (n >= vm->vm_params.vmc_ndisks || fd == -1) {
		log_warnx("invalid disk id");
		errno = EINVAL;
		return (-1);
	}
	idx = vm->vm_params.vmc_diskbases[n]++;
	if (idx >= VM_MAX_BASE_PER_DISK) {
		log_warnx("too many bases for disk");
		errno = EINVAL;
		return (-1);
	}
	vm->vm_disks[n][idx] = fd;
	return (0);
}

int
config_getif(struct privsep *ps, struct imsg *imsg)
{
	struct vmd_vm *vm;
	unsigned int n;
	int fd;

	errno = 0;
	if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
		errno = ENOENT;
		return (-1);
	}

	IMSG_SIZE_CHECK(imsg, &n);
	memcpy(&n, imsg->data, sizeof(n));
	fd = imsg_get_fd(imsg);

	if (n >= vm->vm_params.vmc_nnics ||
	    vm->vm_ifs[n].vif_fd != -1 || fd == -1) {
		log_warnx("invalid interface id");
		goto fail;
	}
	vm->vm_ifs[n].vif_fd = fd;
	return (0);
 fail:
	if (fd != -1)
		close(fd);
	errno = EINVAL;
	return (-1);
}

int
config_getcdrom(struct privsep *ps, struct imsg *imsg)
{
	struct vmd_vm *vm;
	int fd;

	errno = 0;
	if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
		errno = ENOENT;
		return (-1);
	}

	fd = imsg_get_fd(imsg);
	if (fd == -1) {
		log_warnx("invalid cdrom id");
		goto fail;
	}

	vm->vm_cdrom = fd;
	return (0);
 fail:
	errno = EINVAL;
	return (-1);
}
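
/*
 * Note: the config_get*() handlers above are the receiving side of the
 * imsgs composed in config_setvm() and are expected to run in the vmm
 * process (PROC_VMM is assigned CONFIG_VMS in config_init()). Each handler
 * pulls the single file descriptor attached to the message with
 * imsg_get_fd() and stores it in the matching vmd_vm slot: the kernel or
 * BIOS image, a disk (base) image, a tap(4) interface, or the cdrom.
 */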