1 /* $OpenBSD: virtio.c,v 1.123 2025/01/08 15:46:10 dv Exp $ */ 2 3 /* 4 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/param.h> /* PAGE_SIZE */ 20 #include <sys/socket.h> 21 #include <sys/wait.h> 22 23 #include <dev/pci/pcireg.h> 24 #include <dev/pci/pcidevs.h> 25 #include <dev/pv/virtioreg.h> 26 #include <dev/pci/virtio_pcireg.h> 27 #include <dev/pv/vioblkreg.h> 28 #include <dev/vmm/vmm.h> 29 30 #include <net/if.h> 31 #include <netinet/in.h> 32 #include <netinet/if_ether.h> 33 34 #include <errno.h> 35 #include <event.h> 36 #include <stdlib.h> 37 #include <string.h> 38 #include <unistd.h> 39 40 #include "atomicio.h" 41 #include "pci.h" 42 #include "vioscsi.h" 43 #include "virtio.h" 44 #include "vmd.h" 45 46 extern struct vmd *env; 47 extern char *__progname; 48 49 struct viornd_dev viornd; 50 struct vioscsi_dev *vioscsi; 51 struct vmmci_dev vmmci; 52 53 /* Devices emulated in subprocesses are inserted into this list. */ 54 SLIST_HEAD(virtio_dev_head, virtio_dev) virtio_devs; 55 56 #define MAXPHYS (64 * 1024) /* max raw I/O transfer size */ 57 58 #define VIRTIO_NET_F_MAC (1<<5) 59 60 #define VMMCI_F_TIMESYNC (1<<0) 61 #define VMMCI_F_ACK (1<<1) 62 #define VMMCI_F_SYNCRTC (1<<2) 63 64 #define RXQ 0 65 #define TXQ 1 66 67 static int virtio_dev_launch(struct vmd_vm *, struct virtio_dev *); 68 static void virtio_dispatch_dev(int, short, void *); 69 static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *); 70 static int virtio_dev_closefds(struct virtio_dev *); 71 static void vmmci_pipe_dispatch(int, short, void *); 72 73 const char * 74 virtio_reg_name(uint8_t reg) 75 { 76 switch (reg) { 77 case VIRTIO_CONFIG_DEVICE_FEATURES: return "device feature"; 78 case VIRTIO_CONFIG_GUEST_FEATURES: return "guest feature"; 79 case VIRTIO_CONFIG_QUEUE_PFN: return "queue address"; 80 case VIRTIO_CONFIG_QUEUE_SIZE: return "queue size"; 81 case VIRTIO_CONFIG_QUEUE_SELECT: return "queue select"; 82 case VIRTIO_CONFIG_QUEUE_NOTIFY: return "queue notify"; 83 case VIRTIO_CONFIG_DEVICE_STATUS: return "device status"; 84 case VIRTIO_CONFIG_ISR_STATUS: return "isr status"; 85 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI...VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3: 86 return "device config 0"; 87 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4: 88 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5: 89 return "device config 1"; 90 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: return "device config 2"; 91 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: return "device config 3"; 92 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: return "device config 4"; 93 default: return "unknown"; 94 } 95 } 96 97 uint32_t 98 vring_size(uint32_t vq_size) 99 { 100 uint32_t allocsize1, allocsize2; 101 102 /* allocsize1: descriptor table + avail ring + pad */ 103 allocsize1 = 
VIRTQUEUE_ALIGN(sizeof(struct vring_desc) * vq_size 104 + sizeof(uint16_t) * (2 + vq_size)); 105 /* allocsize2: used ring + pad */ 106 allocsize2 = VIRTQUEUE_ALIGN(sizeof(uint16_t) * 2 107 + sizeof(struct vring_used_elem) * vq_size); 108 109 return allocsize1 + allocsize2; 110 } 111 112 /* Update queue select */ 113 void 114 viornd_update_qs(void) 115 { 116 struct virtio_vq_info *vq_info; 117 118 /* Invalid queue? */ 119 if (viornd.cfg.queue_select > 0) { 120 viornd.cfg.queue_size = 0; 121 return; 122 } 123 124 vq_info = &viornd.vq[viornd.cfg.queue_select]; 125 126 /* Update queue pfn/size based on queue select */ 127 viornd.cfg.queue_pfn = vq_info->q_gpa >> 12; 128 viornd.cfg.queue_size = vq_info->qs; 129 } 130 131 /* Update queue address */ 132 void 133 viornd_update_qa(void) 134 { 135 struct virtio_vq_info *vq_info; 136 void *hva = NULL; 137 138 /* Invalid queue? */ 139 if (viornd.cfg.queue_select > 0) 140 return; 141 142 vq_info = &viornd.vq[viornd.cfg.queue_select]; 143 vq_info->q_gpa = (uint64_t)viornd.cfg.queue_pfn * VIRTIO_PAGE_SIZE; 144 145 hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIORND_QUEUE_SIZE)); 146 if (hva == NULL) 147 fatalx("viornd_update_qa"); 148 vq_info->q_hva = hva; 149 } 150 151 int 152 viornd_notifyq(void) 153 { 154 size_t sz; 155 int dxx, ret; 156 uint16_t aidx, uidx; 157 char *vr, *rnd_data; 158 struct vring_desc *desc; 159 struct vring_avail *avail; 160 struct vring_used *used; 161 struct virtio_vq_info *vq_info; 162 163 ret = 0; 164 165 /* Invalid queue? */ 166 if (viornd.cfg.queue_notify > 0) 167 return (0); 168 169 vq_info = &viornd.vq[viornd.cfg.queue_notify]; 170 vr = vq_info->q_hva; 171 if (vr == NULL) 172 fatalx("%s: null vring", __func__); 173 174 desc = (struct vring_desc *)(vr); 175 avail = (struct vring_avail *)(vr + vq_info->vq_availoffset); 176 used = (struct vring_used *)(vr + vq_info->vq_usedoffset); 177 178 aidx = avail->idx & VIORND_QUEUE_MASK; 179 uidx = used->idx & VIORND_QUEUE_MASK; 180 181 dxx = avail->ring[aidx] & VIORND_QUEUE_MASK; 182 183 sz = desc[dxx].len; 184 if (sz > MAXPHYS) 185 fatalx("viornd descriptor size too large (%zu)", sz); 186 187 rnd_data = malloc(sz); 188 189 if (rnd_data != NULL) { 190 arc4random_buf(rnd_data, sz); 191 if (write_mem(desc[dxx].addr, rnd_data, sz)) { 192 log_warnx("viornd: can't write random data @ " 193 "0x%llx", 194 desc[dxx].addr); 195 } else { 196 /* ret == 1 -> interrupt needed */ 197 /* XXX check VIRTIO_F_NO_INTR */ 198 ret = 1; 199 viornd.cfg.isr_status = 1; 200 used->ring[uidx].id = dxx; 201 used->ring[uidx].len = sz; 202 __sync_synchronize(); 203 used->idx++; 204 } 205 free(rnd_data); 206 } else 207 fatal("memory allocation error for viornd data"); 208 209 return (ret); 210 } 211 212 int 213 virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, 214 void *unused, uint8_t sz) 215 { 216 *intr = 0xFF; 217 218 if (dir == 0) { 219 switch (reg) { 220 case VIRTIO_CONFIG_DEVICE_FEATURES: 221 case VIRTIO_CONFIG_QUEUE_SIZE: 222 case VIRTIO_CONFIG_ISR_STATUS: 223 log_warnx("%s: illegal write %x to %s", 224 __progname, *data, virtio_reg_name(reg)); 225 break; 226 case VIRTIO_CONFIG_GUEST_FEATURES: 227 viornd.cfg.guest_feature = *data; 228 break; 229 case VIRTIO_CONFIG_QUEUE_PFN: 230 viornd.cfg.queue_pfn = *data; 231 viornd_update_qa(); 232 break; 233 case VIRTIO_CONFIG_QUEUE_SELECT: 234 viornd.cfg.queue_select = *data; 235 viornd_update_qs(); 236 break; 237 case VIRTIO_CONFIG_QUEUE_NOTIFY: 238 viornd.cfg.queue_notify = *data; 239 if (viornd_notifyq()) 240 *intr = 1; 241 break; 242 case 
VIRTIO_CONFIG_DEVICE_STATUS: 243 viornd.cfg.device_status = *data; 244 break; 245 } 246 } else { 247 switch (reg) { 248 case VIRTIO_CONFIG_DEVICE_FEATURES: 249 *data = viornd.cfg.device_feature; 250 break; 251 case VIRTIO_CONFIG_GUEST_FEATURES: 252 *data = viornd.cfg.guest_feature; 253 break; 254 case VIRTIO_CONFIG_QUEUE_PFN: 255 *data = viornd.cfg.queue_pfn; 256 break; 257 case VIRTIO_CONFIG_QUEUE_SIZE: 258 *data = viornd.cfg.queue_size; 259 break; 260 case VIRTIO_CONFIG_QUEUE_SELECT: 261 *data = viornd.cfg.queue_select; 262 break; 263 case VIRTIO_CONFIG_QUEUE_NOTIFY: 264 *data = viornd.cfg.queue_notify; 265 break; 266 case VIRTIO_CONFIG_DEVICE_STATUS: 267 *data = viornd.cfg.device_status; 268 break; 269 case VIRTIO_CONFIG_ISR_STATUS: 270 *data = viornd.cfg.isr_status; 271 viornd.cfg.isr_status = 0; 272 vcpu_deassert_irq(viornd.vm_id, 0, viornd.irq); 273 break; 274 } 275 } 276 return (0); 277 } 278 279 /* 280 * vmmci_ctl 281 * 282 * Inject a command into the vmmci device, potentially delivering interrupt. 283 * 284 * Called by the vm process's event(3) loop. 285 */ 286 int 287 vmmci_ctl(unsigned int cmd) 288 { 289 int ret = 0; 290 struct timeval tv = { 0, 0 }; 291 292 mutex_lock(&vmmci.mutex); 293 294 if ((vmmci.cfg.device_status & 295 VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0) { 296 ret = -1; 297 goto unlock; 298 } 299 300 if (cmd == vmmci.cmd) 301 goto unlock; 302 303 switch (cmd) { 304 case VMMCI_NONE: 305 break; 306 case VMMCI_SHUTDOWN: 307 case VMMCI_REBOOT: 308 /* Update command */ 309 vmmci.cmd = cmd; 310 311 /* 312 * vmm VMs do not support powerdown, send a reboot request 313 * instead and turn it off after the triple fault. 314 */ 315 if (cmd == VMMCI_SHUTDOWN) 316 cmd = VMMCI_REBOOT; 317 318 /* Trigger interrupt */ 319 vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE; 320 vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq); 321 322 /* Add ACK timeout */ 323 tv.tv_sec = VMMCI_TIMEOUT_SHORT; 324 evtimer_add(&vmmci.timeout, &tv); 325 break; 326 case VMMCI_SYNCRTC: 327 if (vmmci.cfg.guest_feature & VMMCI_F_SYNCRTC) { 328 /* RTC updated, request guest VM resync of its RTC */ 329 vmmci.cmd = cmd; 330 331 vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE; 332 vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq); 333 } else { 334 log_debug("%s: RTC sync skipped (guest does not " 335 "support RTC sync)\n", __func__); 336 } 337 break; 338 default: 339 fatalx("invalid vmmci command: %d", cmd); 340 } 341 342 unlock: 343 mutex_unlock(&vmmci.mutex); 344 345 return (ret); 346 } 347 348 /* 349 * vmmci_ack 350 * 351 * Process a write to the command register. 352 * 353 * Called by the vcpu thread. Must be called with the mutex held. 354 */ 355 void 356 vmmci_ack(unsigned int cmd) 357 { 358 switch (cmd) { 359 case VMMCI_NONE: 360 break; 361 case VMMCI_SHUTDOWN: 362 /* 363 * The shutdown was requested by the VM if we don't have 364 * a pending shutdown request. In this case add a short 365 * timeout to give the VM a chance to reboot before the 366 * timer is expired. 367 */ 368 if (vmmci.cmd == 0) { 369 log_debug("%s: vm %u requested shutdown", __func__, 370 vmmci.vm_id); 371 vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_SHORT); 372 return; 373 } 374 /* FALLTHROUGH */ 375 case VMMCI_REBOOT: 376 /* 377 * If the VM acknowledged our shutdown request, give it 378 * enough time to shutdown or reboot gracefully. This 379 * might take a considerable amount of time (running 380 * rc.shutdown on the VM), so increase the timeout before 381 * killing it forcefully. 
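		 *
		 * Pieced together from vmmci_ctl() above and the config-space
		 * write path below, the shutdown handshake is roughly:
		 *
		 *   1. host: set vmmci.cmd, raise a config-change ISR and arm
		 *      the short ACK timeout (VMMCI_TIMEOUT_SHORT)
		 *   2. guest: read the command from config register 0 and
		 *      write the same value back, which lands in vmmci_ack()
		 *   3. host: on a matching ack, stretch the timeout to
		 *      VMMCI_TIMEOUT_LONG via the device pipe
		 *
		 * If the guest never answers, vmmci_timeout() eventually
		 * forces the shutdown or reboot.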
382 */ 383 if (cmd == vmmci.cmd) { 384 log_debug("%s: vm %u acknowledged shutdown request", 385 __func__, vmmci.vm_id); 386 vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_LONG); 387 } 388 break; 389 case VMMCI_SYNCRTC: 390 log_debug("%s: vm %u acknowledged RTC sync request", 391 __func__, vmmci.vm_id); 392 vmmci.cmd = VMMCI_NONE; 393 break; 394 default: 395 log_warnx("%s: illegal request %u", __func__, cmd); 396 break; 397 } 398 } 399 400 void 401 vmmci_timeout(int fd, short type, void *arg) 402 { 403 log_debug("%s: vm %u shutdown", __progname, vmmci.vm_id); 404 vm_shutdown(vmmci.cmd == VMMCI_REBOOT ? VMMCI_REBOOT : VMMCI_SHUTDOWN); 405 } 406 407 int 408 vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, 409 void *unused, uint8_t sz) 410 { 411 *intr = 0xFF; 412 413 mutex_lock(&vmmci.mutex); 414 if (dir == 0) { 415 switch (reg) { 416 case VIRTIO_CONFIG_DEVICE_FEATURES: 417 case VIRTIO_CONFIG_QUEUE_SIZE: 418 case VIRTIO_CONFIG_ISR_STATUS: 419 log_warnx("%s: illegal write %x to %s", 420 __progname, *data, virtio_reg_name(reg)); 421 break; 422 case VIRTIO_CONFIG_GUEST_FEATURES: 423 vmmci.cfg.guest_feature = *data; 424 break; 425 case VIRTIO_CONFIG_QUEUE_PFN: 426 vmmci.cfg.queue_pfn = *data; 427 break; 428 case VIRTIO_CONFIG_QUEUE_SELECT: 429 vmmci.cfg.queue_select = *data; 430 break; 431 case VIRTIO_CONFIG_QUEUE_NOTIFY: 432 vmmci.cfg.queue_notify = *data; 433 break; 434 case VIRTIO_CONFIG_DEVICE_STATUS: 435 vmmci.cfg.device_status = *data; 436 break; 437 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI: 438 vmmci_ack(*data); 439 break; 440 } 441 } else { 442 switch (reg) { 443 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI: 444 *data = vmmci.cmd; 445 break; 446 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4: 447 /* Update time once when reading the first register */ 448 gettimeofday(&vmmci.time, NULL); 449 *data = (uint64_t)vmmci.time.tv_sec; 450 break; 451 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: 452 *data = (uint64_t)vmmci.time.tv_sec << 32; 453 break; 454 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: 455 *data = (uint64_t)vmmci.time.tv_usec; 456 break; 457 case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: 458 *data = (uint64_t)vmmci.time.tv_usec << 32; 459 break; 460 case VIRTIO_CONFIG_DEVICE_FEATURES: 461 *data = vmmci.cfg.device_feature; 462 break; 463 case VIRTIO_CONFIG_GUEST_FEATURES: 464 *data = vmmci.cfg.guest_feature; 465 break; 466 case VIRTIO_CONFIG_QUEUE_PFN: 467 *data = vmmci.cfg.queue_pfn; 468 break; 469 case VIRTIO_CONFIG_QUEUE_SIZE: 470 *data = vmmci.cfg.queue_size; 471 break; 472 case VIRTIO_CONFIG_QUEUE_SELECT: 473 *data = vmmci.cfg.queue_select; 474 break; 475 case VIRTIO_CONFIG_QUEUE_NOTIFY: 476 *data = vmmci.cfg.queue_notify; 477 break; 478 case VIRTIO_CONFIG_DEVICE_STATUS: 479 *data = vmmci.cfg.device_status; 480 break; 481 case VIRTIO_CONFIG_ISR_STATUS: 482 *data = vmmci.cfg.isr_status; 483 vmmci.cfg.isr_status = 0; 484 vcpu_deassert_irq(vmmci.vm_id, 0, vmmci.irq); 485 break; 486 } 487 } 488 mutex_unlock(&vmmci.mutex); 489 490 return (0); 491 } 492 493 int 494 virtio_get_base(int fd, char *path, size_t npath, int type, const char *dpath) 495 { 496 switch (type) { 497 case VMDF_RAW: 498 return 0; 499 case VMDF_QCOW2: 500 return virtio_qcow2_get_base(fd, path, npath, dpath); 501 } 502 log_warnx("%s: invalid disk format", __func__); 503 return -1; 504 } 505 506 static void 507 vmmci_pipe_dispatch(int fd, short event, void *arg) 508 { 509 enum pipe_msg_type msg; 510 struct timeval tv = { 0, 0 }; 511 512 msg = vm_pipe_recv(&vmmci.dev_pipe); 513 switch (msg) { 514 case 
VMMCI_SET_TIMEOUT_SHORT: 515 tv.tv_sec = VMMCI_TIMEOUT_SHORT; 516 evtimer_add(&vmmci.timeout, &tv); 517 break; 518 case VMMCI_SET_TIMEOUT_LONG: 519 tv.tv_sec = VMMCI_TIMEOUT_LONG; 520 evtimer_add(&vmmci.timeout, &tv); 521 break; 522 default: 523 log_warnx("%s: invalid pipe message type %d", __func__, msg); 524 } 525 } 526 527 void 528 virtio_init(struct vmd_vm *vm, int child_cdrom, 529 int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) 530 { 531 struct vmop_create_params *vmc = &vm->vm_params; 532 struct vm_create_params *vcp = &vmc->vmc_params; 533 struct virtio_dev *dev; 534 uint8_t id; 535 uint8_t i, j; 536 int ret = 0; 537 538 /* Virtio entropy device */ 539 if (pci_add_device(&id, PCI_VENDOR_QUMRANET, 540 PCI_PRODUCT_QUMRANET_VIO_RNG, PCI_CLASS_SYSTEM, 541 PCI_SUBCLASS_SYSTEM_MISC, 542 PCI_VENDOR_OPENBSD, 543 PCI_PRODUCT_VIRTIO_ENTROPY, 1, NULL)) { 544 log_warnx("%s: can't add PCI virtio rng device", 545 __progname); 546 return; 547 } 548 549 if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_rnd_io, NULL)) { 550 log_warnx("%s: can't add bar for virtio rng device", 551 __progname); 552 return; 553 } 554 555 memset(&viornd, 0, sizeof(viornd)); 556 viornd.vq[0].qs = VIORND_QUEUE_SIZE; 557 viornd.vq[0].vq_availoffset = sizeof(struct vring_desc) * 558 VIORND_QUEUE_SIZE; 559 viornd.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN( 560 sizeof(struct vring_desc) * VIORND_QUEUE_SIZE 561 + sizeof(uint16_t) * (2 + VIORND_QUEUE_SIZE)); 562 viornd.pci_id = id; 563 viornd.irq = pci_get_dev_irq(id); 564 viornd.vm_id = vcp->vcp_id; 565 566 SLIST_INIT(&virtio_devs); 567 568 if (vmc->vmc_nnics > 0) { 569 for (i = 0; i < vmc->vmc_nnics; i++) { 570 dev = calloc(1, sizeof(struct virtio_dev)); 571 if (dev == NULL) { 572 log_warn("%s: calloc failure allocating vionet", 573 __progname); 574 return; 575 } 576 /* Virtio network */ 577 dev->dev_type = VMD_DEVTYPE_NET; 578 579 if (pci_add_device(&id, PCI_VENDOR_QUMRANET, 580 PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM, 581 PCI_SUBCLASS_SYSTEM_MISC, PCI_VENDOR_OPENBSD, 582 PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) { 583 log_warnx("%s: can't add PCI virtio net device", 584 __progname); 585 return; 586 } 587 dev->pci_id = id; 588 dev->sync_fd = -1; 589 dev->async_fd = -1; 590 dev->vm_id = vcp->vcp_id; 591 dev->vm_vmid = vm->vm_vmid; 592 dev->irq = pci_get_dev_irq(id); 593 594 /* The vionet pci bar function is called by the vcpu. 
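			 * Register accesses are not handled locally:
			 * virtio_pci_io() forwards them over the synchronous
			 * imsg channel to the forked device subprocess (see
			 * virtio_dev_launch() below), so the vcpu thread
			 * blocks until the child answers a read.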
*/ 595 if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io, 596 dev)) { 597 log_warnx("%s: can't add bar for virtio net " 598 "device", __progname); 599 return; 600 } 601 602 dev->vionet.vq[RXQ].qs = VIONET_QUEUE_SIZE; 603 dev->vionet.vq[RXQ].vq_availoffset = 604 sizeof(struct vring_desc) * VIONET_QUEUE_SIZE; 605 dev->vionet.vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN( 606 sizeof(struct vring_desc) * VIONET_QUEUE_SIZE 607 + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE)); 608 dev->vionet.vq[RXQ].last_avail = 0; 609 dev->vionet.vq[RXQ].notified_avail = 0; 610 611 dev->vionet.vq[TXQ].qs = VIONET_QUEUE_SIZE; 612 dev->vionet.vq[TXQ].vq_availoffset = 613 sizeof(struct vring_desc) * VIONET_QUEUE_SIZE; 614 dev->vionet.vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN( 615 sizeof(struct vring_desc) * VIONET_QUEUE_SIZE 616 + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE)); 617 dev->vionet.vq[TXQ].last_avail = 0; 618 dev->vionet.vq[TXQ].notified_avail = 0; 619 620 dev->vionet.data_fd = child_taps[i]; 621 622 /* MAC address has been assigned by the parent */ 623 memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6); 624 dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC; 625 626 dev->vionet.lockedmac = 627 vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0; 628 dev->vionet.local = 629 vmc->vmc_ifflags[i] & VMIFF_LOCAL ? 1 : 0; 630 if (i == 0 && vmc->vmc_bootdevice & VMBOOTDEV_NET) 631 dev->vionet.pxeboot = 1; 632 memcpy(&dev->vionet.local_prefix, 633 &env->vmd_cfg.cfg_localprefix, 634 sizeof(dev->vionet.local_prefix)); 635 log_debug("%s: vm \"%s\" vio%u lladdr %s%s%s%s", 636 __func__, vcp->vcp_name, i, 637 ether_ntoa((void *)dev->vionet.mac), 638 dev->vionet.lockedmac ? ", locked" : "", 639 dev->vionet.local ? ", local" : "", 640 dev->vionet.pxeboot ? ", pxeboot" : ""); 641 642 /* Add the vionet to our device list. */ 643 dev->vionet.idx = i; 644 SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next); 645 } 646 } 647 648 if (vmc->vmc_ndisks > 0) { 649 for (i = 0; i < vmc->vmc_ndisks; i++) { 650 dev = calloc(1, sizeof(struct virtio_dev)); 651 if (dev == NULL) { 652 log_warn("%s: calloc failure allocating vioblk", 653 __progname); 654 return; 655 } 656 657 /* One vioblk device for each disk defined in vcp */ 658 dev->dev_type = VMD_DEVTYPE_DISK; 659 660 if (pci_add_device(&id, PCI_VENDOR_QUMRANET, 661 PCI_PRODUCT_QUMRANET_VIO_BLOCK, 662 PCI_CLASS_MASS_STORAGE, 663 PCI_SUBCLASS_MASS_STORAGE_SCSI, 664 PCI_VENDOR_OPENBSD, 665 PCI_PRODUCT_VIRTIO_BLOCK, 1, NULL)) { 666 log_warnx("%s: can't add PCI virtio block " 667 "device", __progname); 668 return; 669 } 670 dev->pci_id = id; 671 dev->sync_fd = -1; 672 dev->async_fd = -1; 673 dev->vm_id = vcp->vcp_id; 674 dev->vm_vmid = vm->vm_vmid; 675 dev->irq = pci_get_dev_irq(id); 676 677 if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io, 678 &dev->vioblk)) { 679 log_warnx("%s: can't add bar for virtio block " 680 "device", __progname); 681 return; 682 } 683 dev->vioblk.vq[0].qs = VIOBLK_QUEUE_SIZE; 684 dev->vioblk.vq[0].vq_availoffset = 685 sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE; 686 dev->vioblk.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN( 687 sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE 688 + sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE)); 689 dev->vioblk.vq[0].last_avail = 0; 690 dev->vioblk.cfg.device_feature = 691 VIRTIO_BLK_F_SEG_MAX; 692 dev->vioblk.seg_max = VIOBLK_SEG_MAX; 693 694 /* 695 * Initialize disk fds to an invalid fd (-1), then 696 * set any child disk fds. 
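			 * (memset() with -1 works because filling every byte
			 * with 0xff leaves each element of the int array
			 * equal to -1.)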
697 */ 698 memset(&dev->vioblk.disk_fd, -1, 699 sizeof(dev->vioblk.disk_fd)); 700 dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i]; 701 for (j = 0; j < dev->vioblk.ndisk_fd; j++) 702 dev->vioblk.disk_fd[j] = child_disks[i][j]; 703 704 dev->vioblk.idx = i; 705 SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next); 706 } 707 } 708 709 /* 710 * Launch virtio devices that support subprocess execution. 711 */ 712 SLIST_FOREACH(dev, &virtio_devs, dev_next) { 713 if (virtio_dev_launch(vm, dev) != 0) 714 fatalx("failed to launch virtio device"); 715 } 716 717 /* vioscsi cdrom */ 718 if (strlen(vmc->vmc_cdrom)) { 719 vioscsi = calloc(1, sizeof(struct vioscsi_dev)); 720 if (vioscsi == NULL) { 721 log_warn("%s: calloc failure allocating vioscsi", 722 __progname); 723 return; 724 } 725 726 if (pci_add_device(&id, PCI_VENDOR_QUMRANET, 727 PCI_PRODUCT_QUMRANET_VIO_SCSI, 728 PCI_CLASS_MASS_STORAGE, 729 PCI_SUBCLASS_MASS_STORAGE_SCSI, 730 PCI_VENDOR_OPENBSD, 731 PCI_PRODUCT_VIRTIO_SCSI, 1, NULL)) { 732 log_warnx("%s: can't add PCI vioscsi device", 733 __progname); 734 return; 735 } 736 737 if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vioscsi_io, vioscsi)) { 738 log_warnx("%s: can't add bar for vioscsi device", 739 __progname); 740 return; 741 } 742 743 for (i = 0; i < VIRTIO_MAX_QUEUES; i++) { 744 vioscsi->vq[i].qs = VIOSCSI_QUEUE_SIZE; 745 vioscsi->vq[i].vq_availoffset = 746 sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE; 747 vioscsi->vq[i].vq_usedoffset = VIRTQUEUE_ALIGN( 748 sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE 749 + sizeof(uint16_t) * (2 + VIOSCSI_QUEUE_SIZE)); 750 vioscsi->vq[i].last_avail = 0; 751 } 752 if (virtio_raw_init(&vioscsi->file, &vioscsi->sz, &child_cdrom, 753 1) == -1) { 754 log_warnx("%s: unable to determine iso format", 755 __func__); 756 return; 757 } 758 vioscsi->locked = 0; 759 vioscsi->lba = 0; 760 vioscsi->n_blocks = vioscsi->sz / VIOSCSI_BLOCK_SIZE_CDROM; 761 vioscsi->max_xfer = VIOSCSI_BLOCK_SIZE_CDROM; 762 vioscsi->pci_id = id; 763 vioscsi->vm_id = vcp->vcp_id; 764 vioscsi->irq = pci_get_dev_irq(id); 765 } 766 767 /* virtio control device */ 768 if (pci_add_device(&id, PCI_VENDOR_OPENBSD, 769 PCI_PRODUCT_OPENBSD_CONTROL, 770 PCI_CLASS_COMMUNICATIONS, 771 PCI_SUBCLASS_COMMUNICATIONS_MISC, 772 PCI_VENDOR_OPENBSD, 773 PCI_PRODUCT_VIRTIO_VMMCI, 1, NULL)) { 774 log_warnx("%s: can't add PCI vmm control device", 775 __progname); 776 return; 777 } 778 779 if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vmmci_io, NULL)) { 780 log_warnx("%s: can't add bar for vmm control device", 781 __progname); 782 return; 783 } 784 785 memset(&vmmci, 0, sizeof(vmmci)); 786 vmmci.cfg.device_feature = VMMCI_F_TIMESYNC | VMMCI_F_ACK | 787 VMMCI_F_SYNCRTC; 788 vmmci.vm_id = vcp->vcp_id; 789 vmmci.irq = pci_get_dev_irq(id); 790 vmmci.pci_id = id; 791 ret = pthread_mutex_init(&vmmci.mutex, NULL); 792 if (ret) { 793 errno = ret; 794 fatal("could not initialize vmmci mutex"); 795 } 796 797 evtimer_set(&vmmci.timeout, vmmci_timeout, NULL); 798 vm_pipe_init(&vmmci.dev_pipe, vmmci_pipe_dispatch); 799 event_add(&vmmci.dev_pipe.read_ev, NULL); 800 } 801 802 /* 803 * vionet_set_hostmac 804 * 805 * Sets the hardware address for the host-side tap(4) on a vionet_dev. 
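 * The address is only cached in this process; the device subprocess is
 * told about it with an IMSG_DEVOP_HOSTMAC message on the async channel.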
806 * 807 * This should only be called from the event-loop thread 808 * 809 * vm: pointer to the current vmd_vm instance 810 * idx: index into the array of vionet_dev's for the target vionet_dev 811 * addr: ethernet address to set 812 */ 813 void 814 vionet_set_hostmac(struct vmd_vm *vm, unsigned int idx, uint8_t *addr) 815 { 816 struct vmop_create_params *vmc = &vm->vm_params; 817 struct virtio_dev *dev; 818 struct vionet_dev *vionet = NULL; 819 int ret; 820 821 if (idx > vmc->vmc_nnics) 822 fatalx("%s: invalid vionet index: %u", __func__, idx); 823 824 SLIST_FOREACH(dev, &virtio_devs, dev_next) { 825 if (dev->dev_type == VMD_DEVTYPE_NET 826 && dev->vionet.idx == idx) { 827 vionet = &dev->vionet; 828 break; 829 } 830 } 831 if (vionet == NULL) 832 fatalx("%s: dev == NULL, idx = %u", __func__, idx); 833 834 /* Set the local vm process copy. */ 835 memcpy(vionet->hostmac, addr, sizeof(vionet->hostmac)); 836 837 /* Send the information to the device process. */ 838 ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_HOSTMAC, 0, 0, -1, 839 vionet->hostmac, sizeof(vionet->hostmac)); 840 if (ret == -1) { 841 log_warnx("%s: failed to queue hostmac to vionet dev %u", 842 __func__, idx); 843 return; 844 } 845 } 846 847 void 848 virtio_shutdown(struct vmd_vm *vm) 849 { 850 int ret, status; 851 pid_t pid = 0; 852 struct virtio_dev *dev, *tmp; 853 struct viodev_msg msg; 854 struct imsgbuf *ibuf; 855 856 /* Ensure that our disks are synced. */ 857 if (vioscsi != NULL) 858 vioscsi->file.close(vioscsi->file.p, 0); 859 860 /* 861 * Broadcast shutdown to child devices. We need to do this 862 * synchronously as we have already stopped the async event thread. 863 */ 864 SLIST_FOREACH(dev, &virtio_devs, dev_next) { 865 memset(&msg, 0, sizeof(msg)); 866 msg.type = VIODEV_MSG_SHUTDOWN; 867 ibuf = &dev->sync_iev.ibuf; 868 ret = imsg_compose(ibuf, VIODEV_MSG_SHUTDOWN, 0, 0, -1, 869 &msg, sizeof(msg)); 870 if (ret == -1) 871 fatalx("%s: failed to send shutdown to device", 872 __func__); 873 if (imsgbuf_flush(ibuf) == -1) 874 fatalx("%s: imsgbuf_flush", __func__); 875 } 876 877 /* 878 * Wait for all children to shutdown using a simple approach of 879 * iterating over known child devices and waiting for them to die. 
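	 * Note that waitpid() is used with WNOHANG in a loop, so this spins
	 * rather than blocks; each child was already sent a synchronous
	 * VIODEV_MSG_SHUTDOWN above, so the wait should be brief.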
880 */ 881 SLIST_FOREACH_SAFE(dev, &virtio_devs, dev_next, tmp) { 882 log_debug("%s: waiting on device pid %d", __func__, 883 dev->dev_pid); 884 do { 885 pid = waitpid(dev->dev_pid, &status, WNOHANG); 886 } while (pid == 0 || (pid == -1 && errno == EINTR)); 887 if (pid == dev->dev_pid) 888 log_debug("%s: device for pid %d is stopped", 889 __func__, pid); 890 else 891 log_warnx("%s: unexpected pid %d", __func__, pid); 892 free(dev); 893 } 894 } 895 896 int 897 vmmci_restore(int fd, uint32_t vm_id) 898 { 899 log_debug("%s: receiving vmmci", __func__); 900 if (atomicio(read, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) { 901 log_warnx("%s: error reading vmmci from fd", __func__); 902 return (-1); 903 } 904 905 if (pci_set_bar_fn(vmmci.pci_id, 0, vmmci_io, NULL)) { 906 log_warnx("%s: can't set bar fn for vmm control device", 907 __progname); 908 return (-1); 909 } 910 vmmci.vm_id = vm_id; 911 vmmci.irq = pci_get_dev_irq(vmmci.pci_id); 912 memset(&vmmci.timeout, 0, sizeof(struct event)); 913 evtimer_set(&vmmci.timeout, vmmci_timeout, NULL); 914 return (0); 915 } 916 917 int 918 viornd_restore(int fd, struct vmd_vm *vm) 919 { 920 void *hva = NULL; 921 922 log_debug("%s: receiving viornd", __func__); 923 if (atomicio(read, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) { 924 log_warnx("%s: error reading viornd from fd", __func__); 925 return (-1); 926 } 927 if (pci_set_bar_fn(viornd.pci_id, 0, virtio_rnd_io, NULL)) { 928 log_warnx("%s: can't set bar fn for virtio rng device", 929 __progname); 930 return (-1); 931 } 932 viornd.vm_id = vm->vm_params.vmc_params.vcp_id; 933 viornd.irq = pci_get_dev_irq(viornd.pci_id); 934 935 hva = hvaddr_mem(viornd.vq[0].q_gpa, vring_size(VIORND_QUEUE_SIZE)); 936 if (hva == NULL) 937 fatal("failed to restore viornd virtqueue"); 938 viornd.vq[0].q_hva = hva; 939 940 return (0); 941 } 942 943 int 944 vionet_restore(int fd, struct vmd_vm *vm, int *child_taps) 945 { 946 struct vmop_create_params *vmc = &vm->vm_params; 947 struct vm_create_params *vcp = &vmc->vmc_params; 948 struct virtio_dev *dev; 949 uint8_t i; 950 951 if (vmc->vmc_nnics == 0) 952 return (0); 953 954 for (i = 0; i < vmc->vmc_nnics; i++) { 955 dev = calloc(1, sizeof(struct virtio_dev)); 956 if (dev == NULL) { 957 log_warn("%s: calloc failure allocating vionet", 958 __progname); 959 return (-1); 960 } 961 962 log_debug("%s: receiving virtio network device", __func__); 963 if (atomicio(read, fd, dev, sizeof(struct virtio_dev)) 964 != sizeof(struct virtio_dev)) { 965 log_warnx("%s: error reading vionet from fd", 966 __func__); 967 return (-1); 968 } 969 970 /* Virtio network */ 971 if (dev->dev_type != VMD_DEVTYPE_NET) { 972 log_warnx("%s: invalid device type", __func__); 973 return (-1); 974 } 975 976 dev->sync_fd = -1; 977 dev->async_fd = -1; 978 dev->vm_id = vcp->vcp_id; 979 dev->vm_vmid = vm->vm_vmid; 980 dev->irq = pci_get_dev_irq(dev->pci_id); 981 982 if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) { 983 log_warnx("%s: can't set bar fn for virtio net " 984 "device", __progname); 985 return (-1); 986 } 987 988 dev->vionet.data_fd = child_taps[i]; 989 dev->vionet.idx = i; 990 991 SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next); 992 } 993 994 return (0); 995 } 996 997 int 998 vioblk_restore(int fd, struct vmd_vm *vm, 999 int child_disks[][VM_MAX_BASE_PER_DISK]) 1000 { 1001 struct vmop_create_params *vmc = &vm->vm_params; 1002 struct virtio_dev *dev; 1003 uint8_t i, j; 1004 1005 if (vmc->vmc_ndisks == 0) 1006 return (0); 1007 1008 for (i = 0; i < vmc->vmc_ndisks; i++) { 1009 dev = calloc(1, 
sizeof(struct virtio_dev)); 1010 if (dev == NULL) { 1011 log_warn("%s: calloc failure allocating vioblks", 1012 __progname); 1013 return (-1); 1014 } 1015 1016 log_debug("%s: receiving vioblk", __func__); 1017 if (atomicio(read, fd, dev, sizeof(struct virtio_dev)) 1018 != sizeof(struct virtio_dev)) { 1019 log_warnx("%s: error reading vioblk from fd", __func__); 1020 return (-1); 1021 } 1022 if (dev->dev_type != VMD_DEVTYPE_DISK) { 1023 log_warnx("%s: invalid device type", __func__); 1024 return (-1); 1025 } 1026 1027 dev->sync_fd = -1; 1028 dev->async_fd = -1; 1029 1030 if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) { 1031 log_warnx("%s: can't set bar fn for virtio block " 1032 "device", __progname); 1033 return (-1); 1034 } 1035 dev->vm_id = vmc->vmc_params.vcp_id; 1036 dev->irq = pci_get_dev_irq(dev->pci_id); 1037 1038 memset(&dev->vioblk.disk_fd, -1, sizeof(dev->vioblk.disk_fd)); 1039 dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i]; 1040 for (j = 0; j < dev->vioblk.ndisk_fd; j++) 1041 dev->vioblk.disk_fd[j] = child_disks[i][j]; 1042 1043 dev->vioblk.idx = i; 1044 SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next); 1045 } 1046 return (0); 1047 } 1048 1049 int 1050 vioscsi_restore(int fd, struct vmd_vm *vm, int child_cdrom) 1051 { 1052 void *hva = NULL; 1053 unsigned int i; 1054 1055 if (!strlen(vm->vm_params.vmc_cdrom)) 1056 return (0); 1057 1058 vioscsi = calloc(1, sizeof(struct vioscsi_dev)); 1059 if (vioscsi == NULL) { 1060 log_warn("%s: calloc failure allocating vioscsi", __progname); 1061 return (-1); 1062 } 1063 1064 log_debug("%s: receiving vioscsi", __func__); 1065 1066 if (atomicio(read, fd, vioscsi, sizeof(struct vioscsi_dev)) != 1067 sizeof(struct vioscsi_dev)) { 1068 log_warnx("%s: error reading vioscsi from fd", __func__); 1069 return (-1); 1070 } 1071 1072 if (pci_set_bar_fn(vioscsi->pci_id, 0, vioscsi_io, vioscsi)) { 1073 log_warnx("%s: can't set bar fn for vmm control device", 1074 __progname); 1075 return (-1); 1076 } 1077 1078 vioscsi->vm_id = vm->vm_params.vmc_params.vcp_id; 1079 vioscsi->irq = pci_get_dev_irq(vioscsi->pci_id); 1080 1081 /* vioscsi uses 3 virtqueues. 
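	 * (In the virtio SCSI queue layout these are the control, event and
	 * request queues.)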
*/ 1082 for (i = 0; i < 3; i++) { 1083 hva = hvaddr_mem(vioscsi->vq[i].q_gpa, 1084 vring_size(VIOSCSI_QUEUE_SIZE)); 1085 if (hva == NULL) 1086 fatal("failed to restore vioscsi virtqueue"); 1087 vioscsi->vq[i].q_hva = hva; 1088 } 1089 1090 return (0); 1091 } 1092 1093 int 1094 virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom, 1095 int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) 1096 { 1097 struct virtio_dev *dev; 1098 int ret; 1099 1100 SLIST_INIT(&virtio_devs); 1101 1102 if ((ret = viornd_restore(fd, vm)) == -1) 1103 return (ret); 1104 1105 if ((ret = vioblk_restore(fd, vm, child_disks)) == -1) 1106 return (ret); 1107 1108 if ((ret = vioscsi_restore(fd, vm, child_cdrom)) == -1) 1109 return (ret); 1110 1111 if ((ret = vionet_restore(fd, vm, child_taps)) == -1) 1112 return (ret); 1113 1114 if ((ret = vmmci_restore(fd, vm->vm_params.vmc_params.vcp_id)) == -1) 1115 return (ret); 1116 1117 SLIST_FOREACH(dev, &virtio_devs, dev_next) { 1118 if (virtio_dev_launch(vm, dev) != 0) 1119 fatalx("%s: failed to restore virtio dev", __func__); 1120 } 1121 1122 return (0); 1123 } 1124 1125 int 1126 viornd_dump(int fd) 1127 { 1128 log_debug("%s: sending viornd", __func__); 1129 1130 viornd.vq[0].q_hva = NULL; 1131 1132 if (atomicio(vwrite, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) { 1133 log_warnx("%s: error writing viornd to fd", __func__); 1134 return (-1); 1135 } 1136 return (0); 1137 } 1138 1139 int 1140 vmmci_dump(int fd) 1141 { 1142 log_debug("%s: sending vmmci", __func__); 1143 1144 if (atomicio(vwrite, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) { 1145 log_warnx("%s: error writing vmmci to fd", __func__); 1146 return (-1); 1147 } 1148 return (0); 1149 } 1150 1151 int 1152 vionet_dump(int fd) 1153 { 1154 struct virtio_dev *dev, temp; 1155 struct viodev_msg msg; 1156 struct imsg imsg; 1157 struct imsgbuf *ibuf = NULL; 1158 size_t sz; 1159 int ret; 1160 1161 log_debug("%s: dumping vionet", __func__); 1162 1163 SLIST_FOREACH(dev, &virtio_devs, dev_next) { 1164 if (dev->dev_type != VMD_DEVTYPE_NET) 1165 continue; 1166 1167 memset(&msg, 0, sizeof(msg)); 1168 memset(&imsg, 0, sizeof(imsg)); 1169 1170 ibuf = &dev->sync_iev.ibuf; 1171 msg.type = VIODEV_MSG_DUMP; 1172 1173 ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg, 1174 sizeof(msg)); 1175 if (ret == -1) { 1176 log_warnx("%s: failed requesting dump of vionet[%d]", 1177 __func__, dev->vionet.idx); 1178 return (-1); 1179 } 1180 if (imsgbuf_flush(ibuf) == -1) { 1181 log_warnx("%s: imsgbuf_flush", __func__); 1182 return (-1); 1183 } 1184 1185 sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp)); 1186 if (sz != sizeof(temp)) { 1187 log_warnx("%s: failed to dump vionet[%d]", __func__, 1188 dev->vionet.idx); 1189 return (-1); 1190 } 1191 1192 /* Clear volatile state. Will reinitialize on restore. 
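		 * The q_hva pointers and the imsg/event state are only
		 * meaningful in this process, so they are scrubbed from the
		 * copy being serialized.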
*/ 1193 temp.vionet.vq[RXQ].q_hva = NULL; 1194 temp.vionet.vq[TXQ].q_hva = NULL; 1195 temp.async_fd = -1; 1196 temp.sync_fd = -1; 1197 memset(&temp.async_iev, 0, sizeof(temp.async_iev)); 1198 memset(&temp.sync_iev, 0, sizeof(temp.sync_iev)); 1199 1200 if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) { 1201 log_warnx("%s: error writing vionet to fd", __func__); 1202 return (-1); 1203 } 1204 } 1205 1206 return (0); 1207 } 1208 1209 int 1210 vioblk_dump(int fd) 1211 { 1212 struct virtio_dev *dev, temp; 1213 struct viodev_msg msg; 1214 struct imsg imsg; 1215 struct imsgbuf *ibuf = NULL; 1216 size_t sz; 1217 int ret; 1218 1219 log_debug("%s: dumping vioblk", __func__); 1220 1221 SLIST_FOREACH(dev, &virtio_devs, dev_next) { 1222 if (dev->dev_type != VMD_DEVTYPE_DISK) 1223 continue; 1224 1225 memset(&msg, 0, sizeof(msg)); 1226 memset(&imsg, 0, sizeof(imsg)); 1227 1228 ibuf = &dev->sync_iev.ibuf; 1229 msg.type = VIODEV_MSG_DUMP; 1230 1231 ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg, 1232 sizeof(msg)); 1233 if (ret == -1) { 1234 log_warnx("%s: failed requesting dump of vioblk[%d]", 1235 __func__, dev->vioblk.idx); 1236 return (-1); 1237 } 1238 if (imsgbuf_flush(ibuf) == -1) { 1239 log_warnx("%s: imsgbuf_flush", __func__); 1240 return (-1); 1241 } 1242 1243 1244 sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp)); 1245 if (sz != sizeof(temp)) { 1246 log_warnx("%s: failed to dump vioblk[%d]", __func__, 1247 dev->vioblk.idx); 1248 return (-1); 1249 } 1250 1251 /* Clear volatile state. Will reinitialize on restore. */ 1252 temp.vioblk.vq[0].q_hva = NULL; 1253 temp.async_fd = -1; 1254 temp.sync_fd = -1; 1255 memset(&temp.async_iev, 0, sizeof(temp.async_iev)); 1256 memset(&temp.sync_iev, 0, sizeof(temp.sync_iev)); 1257 1258 if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) { 1259 log_warnx("%s: error writing vioblk to fd", __func__); 1260 return (-1); 1261 } 1262 } 1263 1264 return (0); 1265 } 1266 1267 int 1268 vioscsi_dump(int fd) 1269 { 1270 unsigned int i; 1271 1272 if (vioscsi == NULL) 1273 return (0); 1274 1275 log_debug("%s: sending vioscsi", __func__); 1276 1277 for (i = 0; i < 3; i++) 1278 vioscsi->vq[i].q_hva = NULL; 1279 1280 if (atomicio(vwrite, fd, vioscsi, sizeof(struct vioscsi_dev)) != 1281 sizeof(struct vioscsi_dev)) { 1282 log_warnx("%s: error writing vioscsi to fd", __func__); 1283 return (-1); 1284 } 1285 return (0); 1286 } 1287 1288 int 1289 virtio_dump(int fd) 1290 { 1291 int ret; 1292 1293 if ((ret = viornd_dump(fd)) == -1) 1294 return ret; 1295 1296 if ((ret = vioblk_dump(fd)) == -1) 1297 return ret; 1298 1299 if ((ret = vioscsi_dump(fd)) == -1) 1300 return ret; 1301 1302 if ((ret = vionet_dump(fd)) == -1) 1303 return ret; 1304 1305 if ((ret = vmmci_dump(fd)) == -1) 1306 return ret; 1307 1308 return (0); 1309 } 1310 1311 void virtio_broadcast_imsg(struct vmd_vm *vm, uint16_t type, void *data, 1312 uint16_t datalen) 1313 { 1314 struct virtio_dev *dev; 1315 int ret; 1316 1317 SLIST_FOREACH(dev, &virtio_devs, dev_next) { 1318 ret = imsg_compose_event(&dev->async_iev, type, 0, 0, -1, data, 1319 datalen); 1320 if (ret == -1) { 1321 log_warnx("%s: failed to broadcast imsg type %u", 1322 __func__, type); 1323 } 1324 } 1325 1326 } 1327 1328 void 1329 virtio_stop(struct vmd_vm *vm) 1330 { 1331 return virtio_broadcast_imsg(vm, IMSG_VMDOP_PAUSE_VM, NULL, 0); 1332 } 1333 1334 void 1335 virtio_start(struct vmd_vm *vm) 1336 { 1337 return virtio_broadcast_imsg(vm, IMSG_VMDOP_UNPAUSE_VM, NULL, 0); 1338 } 1339 1340 /* 1341 * Fork+exec a child virtio device. 
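 *
 * The parent and child share two socketpairs: a synchronous channel used
 * for register IO and an async channel for events. The parent writes the
 * configured struct virtio_dev and then the struct vmd_vm over the sync
 * fd, the child re-execs vmd, and the parent waits for a VIODEV_MSG_READY
 * reply before wiring up the async handler. The child ends up exec'ing
 * something like the following (fd numbers and names are illustrative):
 *
 *   vmd -X 5 -t d -i 3 -p myvm
 *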
Returns 0 on success. 1342 */ 1343 static int 1344 virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev) 1345 { 1346 char *nargv[12], num[32], vmm_fd[32], vm_name[VM_NAME_MAX], t[2]; 1347 pid_t dev_pid; 1348 int sync_fds[2], async_fds[2], ret = 0; 1349 size_t i, sz = 0; 1350 struct viodev_msg msg; 1351 struct virtio_dev *dev_entry; 1352 struct imsg imsg; 1353 struct imsgev *iev = &dev->sync_iev; 1354 1355 switch (dev->dev_type) { 1356 case VMD_DEVTYPE_NET: 1357 log_debug("%s: launching vionet%d", 1358 vm->vm_params.vmc_params.vcp_name, dev->vionet.idx); 1359 break; 1360 case VMD_DEVTYPE_DISK: 1361 log_debug("%s: launching vioblk%d", 1362 vm->vm_params.vmc_params.vcp_name, dev->vioblk.idx); 1363 break; 1364 /* NOTREACHED */ 1365 default: 1366 log_warn("%s: invalid device type", __func__); 1367 return (EINVAL); 1368 } 1369 1370 /* We need two channels: one synchronous (IO reads) and one async. */ 1371 if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, PF_UNSPEC, 1372 sync_fds) == -1) { 1373 log_warn("failed to create socketpair"); 1374 return (errno); 1375 } 1376 if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, PF_UNSPEC, 1377 async_fds) == -1) { 1378 log_warn("failed to create async socketpair"); 1379 return (errno); 1380 } 1381 1382 /* Fork... */ 1383 dev_pid = fork(); 1384 if (dev_pid == -1) { 1385 ret = errno; 1386 log_warn("%s: fork failed", __func__); 1387 goto err; 1388 } 1389 1390 if (dev_pid > 0) { 1391 /* Parent */ 1392 close_fd(sync_fds[1]); 1393 close_fd(async_fds[1]); 1394 1395 /* Save the child's pid to help with cleanup. */ 1396 dev->dev_pid = dev_pid; 1397 1398 /* Set the channel fds to the child's before sending. */ 1399 dev->sync_fd = sync_fds[1]; 1400 dev->async_fd = async_fds[1]; 1401 1402 /* 1. Send over our configured device. */ 1403 log_debug("%s: sending '%c' type device struct", __func__, 1404 dev->dev_type); 1405 sz = atomicio(vwrite, sync_fds[0], dev, sizeof(*dev)); 1406 if (sz != sizeof(*dev)) { 1407 log_warnx("%s: failed to send device", __func__); 1408 ret = EIO; 1409 goto err; 1410 } 1411 1412 /* Close data fds. Only the child device needs them now. */ 1413 if (virtio_dev_closefds(dev) == -1) { 1414 log_warnx("%s: failed to close device data fds", 1415 __func__); 1416 goto err; 1417 } 1418 1419 /* 2. Send over details on the VM (including memory fds). */ 1420 log_debug("%s: sending vm message for '%s'", __func__, 1421 vm->vm_params.vmc_params.vcp_name); 1422 sz = atomicio(vwrite, sync_fds[0], vm, sizeof(*vm)); 1423 if (sz != sizeof(*vm)) { 1424 log_warnx("%s: failed to send vm details", __func__); 1425 ret = EIO; 1426 goto err; 1427 } 1428 1429 /* 1430 * Initialize our imsg channel to the child device. The initial 1431 * communication will be synchronous. We expect the child to 1432 * report itself "ready" to confirm the launch was a success. 
1433 */ 1434 if (imsgbuf_init(&iev->ibuf, sync_fds[0]) == -1) { 1435 log_warn("%s: failed to init imsgbuf", __func__); 1436 goto err; 1437 } 1438 imsgbuf_allow_fdpass(&iev->ibuf); 1439 ret = imsgbuf_read_one(&iev->ibuf, &imsg); 1440 if (ret == 0 || ret == -1) { 1441 log_warnx("%s: failed to receive ready message from " 1442 "'%c' type device", __func__, dev->dev_type); 1443 ret = EIO; 1444 goto err; 1445 } 1446 ret = 0; 1447 1448 IMSG_SIZE_CHECK(&imsg, &msg); 1449 memcpy(&msg, imsg.data, sizeof(msg)); 1450 imsg_free(&imsg); 1451 1452 if (msg.type != VIODEV_MSG_READY) { 1453 log_warnx("%s: expected ready message, got type %d", 1454 __func__, msg.type); 1455 ret = EINVAL; 1456 goto err; 1457 } 1458 log_debug("%s: device reports ready via sync channel", 1459 __func__); 1460 1461 /* 1462 * Wire in the async event handling, but after reverting back 1463 * to the parent's fd's. 1464 */ 1465 dev->sync_fd = sync_fds[0]; 1466 dev->async_fd = async_fds[0]; 1467 vm_device_pipe(dev, virtio_dispatch_dev, NULL); 1468 } else { 1469 /* Child */ 1470 close_fd(async_fds[0]); 1471 close_fd(sync_fds[0]); 1472 1473 /* Close pty. Virtio devices do not need it. */ 1474 close_fd(vm->vm_tty); 1475 vm->vm_tty = -1; 1476 1477 if (vm->vm_cdrom != -1) { 1478 close_fd(vm->vm_cdrom); 1479 vm->vm_cdrom = -1; 1480 } 1481 1482 /* Keep data file descriptors open after exec. */ 1483 SLIST_FOREACH(dev_entry, &virtio_devs, dev_next) { 1484 if (dev_entry == dev) 1485 continue; 1486 if (virtio_dev_closefds(dev_entry) == -1) 1487 fatalx("unable to close other virtio devs"); 1488 } 1489 1490 memset(num, 0, sizeof(num)); 1491 snprintf(num, sizeof(num), "%d", sync_fds[1]); 1492 memset(vmm_fd, 0, sizeof(vmm_fd)); 1493 snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd); 1494 memset(vm_name, 0, sizeof(vm_name)); 1495 snprintf(vm_name, sizeof(vm_name), "%s", 1496 vm->vm_params.vmc_params.vcp_name); 1497 1498 t[0] = dev->dev_type; 1499 t[1] = '\0'; 1500 1501 i = 0; 1502 nargv[i++] = env->argv0; 1503 nargv[i++] = "-X"; 1504 nargv[i++] = num; 1505 nargv[i++] = "-t"; 1506 nargv[i++] = t; 1507 nargv[i++] = "-i"; 1508 nargv[i++] = vmm_fd; 1509 nargv[i++] = "-p"; 1510 nargv[i++] = vm_name; 1511 if (env->vmd_debug) 1512 nargv[i++] = "-d"; 1513 if (env->vmd_verbose == 1) 1514 nargv[i++] = "-v"; 1515 else if (env->vmd_verbose > 1) 1516 nargv[i++] = "-vv"; 1517 nargv[i++] = NULL; 1518 if (i > sizeof(nargv) / sizeof(nargv[0])) 1519 fatalx("%s: nargv overflow", __func__); 1520 1521 /* Control resumes in vmd.c:main(). */ 1522 execvp(nargv[0], nargv); 1523 1524 ret = errno; 1525 log_warn("%s: failed to exec device", __func__); 1526 _exit(ret); 1527 /* NOTREACHED */ 1528 } 1529 1530 return (ret); 1531 1532 err: 1533 close_fd(sync_fds[0]); 1534 close_fd(sync_fds[1]); 1535 close_fd(async_fds[0]); 1536 close_fd(async_fds[1]); 1537 return (ret); 1538 } 1539 1540 /* 1541 * Initialize an async imsg channel for a virtio device. 
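 * The channel allows fd passing and the read handler is registered via
 * imsg_event_add2(), so it can be attached to a caller-provided event
 * base.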
1542 */ 1543 int 1544 vm_device_pipe(struct virtio_dev *dev, void (*cb)(int, short, void *), 1545 struct event_base *ev_base) 1546 { 1547 struct imsgev *iev = &dev->async_iev; 1548 int fd = dev->async_fd; 1549 1550 log_debug("%s: initializing '%c' device pipe (fd=%d)", __func__, 1551 dev->dev_type, fd); 1552 1553 if (imsgbuf_init(&iev->ibuf, fd) == -1) 1554 fatal("imsgbuf_init"); 1555 imsgbuf_allow_fdpass(&iev->ibuf); 1556 iev->handler = cb; 1557 iev->data = dev; 1558 iev->events = EV_READ; 1559 imsg_event_add2(iev, ev_base); 1560 1561 return (0); 1562 } 1563 1564 void 1565 virtio_dispatch_dev(int fd, short event, void *arg) 1566 { 1567 struct virtio_dev *dev = (struct virtio_dev*)arg; 1568 struct imsgev *iev = &dev->async_iev; 1569 struct imsgbuf *ibuf = &iev->ibuf; 1570 struct imsg imsg; 1571 struct viodev_msg msg; 1572 ssize_t n = 0; 1573 1574 if (event & EV_READ) { 1575 if ((n = imsgbuf_read(ibuf)) == -1) 1576 fatal("%s: imsgbuf_read", __func__); 1577 if (n == 0) { 1578 /* this pipe is dead, so remove the event handler */ 1579 log_debug("%s: pipe dead (EV_READ)", __func__); 1580 event_del(&iev->ev); 1581 event_loopexit(NULL); 1582 return; 1583 } 1584 } 1585 1586 if (event & EV_WRITE) { 1587 if (imsgbuf_write(ibuf) == -1) { 1588 if (errno == EPIPE) { 1589 /* this pipe is dead, remove the handler */ 1590 log_debug("%s: pipe dead (EV_WRITE)", __func__); 1591 event_del(&iev->ev); 1592 event_loopexit(NULL); 1593 return; 1594 } 1595 fatal("%s: imsgbuf_write", __func__); 1596 } 1597 } 1598 1599 for (;;) { 1600 if ((n = imsg_get(ibuf, &imsg)) == -1) 1601 fatal("%s: imsg_get", __func__); 1602 if (n == 0) 1603 break; 1604 1605 switch (imsg.hdr.type) { 1606 case IMSG_DEVOP_MSG: 1607 IMSG_SIZE_CHECK(&imsg, &msg); 1608 memcpy(&msg, imsg.data, sizeof(msg)); 1609 handle_dev_msg(&msg, dev); 1610 break; 1611 default: 1612 log_warnx("%s: got non devop imsg %d", __func__, 1613 imsg.hdr.type); 1614 break; 1615 } 1616 imsg_free(&imsg); 1617 } 1618 imsg_event_add(iev); 1619 } 1620 1621 1622 static int 1623 handle_dev_msg(struct viodev_msg *msg, struct virtio_dev *gdev) 1624 { 1625 uint32_t vm_id = gdev->vm_id; 1626 int irq = gdev->irq; 1627 1628 switch (msg->type) { 1629 case VIODEV_MSG_KICK: 1630 if (msg->state == INTR_STATE_ASSERT) 1631 vcpu_assert_irq(vm_id, msg->vcpu, irq); 1632 else if (msg->state == INTR_STATE_DEASSERT) 1633 vcpu_deassert_irq(vm_id, msg->vcpu, irq); 1634 break; 1635 case VIODEV_MSG_READY: 1636 log_debug("%s: device reports ready", __func__); 1637 break; 1638 case VIODEV_MSG_ERROR: 1639 log_warnx("%s: device reported error", __func__); 1640 break; 1641 case VIODEV_MSG_INVALID: 1642 case VIODEV_MSG_IO_READ: 1643 case VIODEV_MSG_IO_WRITE: 1644 /* FALLTHROUGH */ 1645 default: 1646 log_warnx("%s: unsupported device message type %d", __func__, 1647 msg->type); 1648 return (1); 1649 } 1650 1651 return (0); 1652 }; 1653 1654 /* 1655 * Called by the VM process while processing IO from the VCPU thread. 1656 * 1657 * N.b. Since the VCPU thread calls this function, we cannot mutate the event 1658 * system. All ipc messages must be sent manually and cannot be queued for 1659 * the event loop to push them. (We need to perform a synchronous read, so 1660 * this isn't really a big deal.) 
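 *
 * In practice writes are fire-and-forget (compose and flush a
 * VIODEV_MSG_IO_WRITE), while reads compose a VIODEV_MSG_IO_READ, flush,
 * and then block in imsgbuf_read_one() until the device replies; the
 * reply may also ask for the irq to be asserted or deasserted.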
1661 */ 1662 int 1663 virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, 1664 void *cookie, uint8_t sz) 1665 { 1666 struct virtio_dev *dev = (struct virtio_dev *)cookie; 1667 struct imsgbuf *ibuf = &dev->sync_iev.ibuf; 1668 struct imsg imsg; 1669 struct viodev_msg msg; 1670 int ret = 0; 1671 1672 memset(&msg, 0, sizeof(msg)); 1673 msg.reg = reg; 1674 msg.io_sz = sz; 1675 1676 if (dir == 0) { 1677 msg.type = VIODEV_MSG_IO_WRITE; 1678 msg.data = *data; 1679 msg.data_valid = 1; 1680 } else 1681 msg.type = VIODEV_MSG_IO_READ; 1682 1683 if (msg.type == VIODEV_MSG_IO_WRITE) { 1684 /* 1685 * Write request. No reply expected. 1686 */ 1687 ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg, 1688 sizeof(msg)); 1689 if (ret == -1) { 1690 log_warn("%s: failed to send async io event to virtio" 1691 " device", __func__); 1692 return (ret); 1693 } 1694 if (imsgbuf_flush(ibuf) == -1) { 1695 log_warnx("%s: imsgbuf_flush (write)", __func__); 1696 return (-1); 1697 } 1698 } else { 1699 /* 1700 * Read request. Requires waiting for a reply. 1701 */ 1702 ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg, 1703 sizeof(msg)); 1704 if (ret == -1) { 1705 log_warnx("%s: failed to send sync io event to virtio" 1706 " device", __func__); 1707 return (ret); 1708 } 1709 if (imsgbuf_flush(ibuf) == -1) { 1710 log_warnx("%s: imsgbuf_flush (read)", __func__); 1711 return (-1); 1712 } 1713 1714 /* Read our reply. */ 1715 ret = imsgbuf_read_one(ibuf, &imsg); 1716 if (ret == 0 || ret == -1) { 1717 log_warn("%s: imsgbuf_read (n=%d)", __func__, ret); 1718 return (-1); 1719 } 1720 IMSG_SIZE_CHECK(&imsg, &msg); 1721 memcpy(&msg, imsg.data, sizeof(msg)); 1722 imsg_free(&imsg); 1723 1724 if (msg.type == VIODEV_MSG_IO_READ && msg.data_valid) { 1725 #if DEBUG 1726 log_debug("%s: got sync read response (reg=%s)", 1727 __func__, virtio_reg_name(msg.reg)); 1728 #endif /* DEBUG */ 1729 *data = msg.data; 1730 /* 1731 * It's possible we're asked to {de,}assert after the 1732 * device performs a register read. 1733 */ 1734 if (msg.state == INTR_STATE_ASSERT) 1735 vcpu_assert_irq(dev->vm_id, msg.vcpu, msg.irq); 1736 else if (msg.state == INTR_STATE_DEASSERT) 1737 vcpu_deassert_irq(dev->vm_id, msg.vcpu, msg.irq); 1738 } else { 1739 log_warnx("%s: expected IO_READ, got %d", __func__, 1740 msg.type); 1741 return (-1); 1742 } 1743 } 1744 1745 return (0); 1746 } 1747 1748 void 1749 virtio_assert_irq(struct virtio_dev *dev, int vcpu) 1750 { 1751 struct viodev_msg msg; 1752 int ret; 1753 1754 memset(&msg, 0, sizeof(msg)); 1755 msg.irq = dev->irq; 1756 msg.vcpu = vcpu; 1757 msg.type = VIODEV_MSG_KICK; 1758 msg.state = INTR_STATE_ASSERT; 1759 1760 ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1, 1761 &msg, sizeof(msg)); 1762 if (ret == -1) 1763 log_warnx("%s: failed to assert irq %d", __func__, dev->irq); 1764 } 1765 1766 void 1767 virtio_deassert_irq(struct virtio_dev *dev, int vcpu) 1768 { 1769 struct viodev_msg msg; 1770 int ret; 1771 1772 memset(&msg, 0, sizeof(msg)); 1773 msg.irq = dev->irq; 1774 msg.vcpu = vcpu; 1775 msg.type = VIODEV_MSG_KICK; 1776 msg.state = INTR_STATE_DEASSERT; 1777 1778 ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1, 1779 &msg, sizeof(msg)); 1780 if (ret == -1) 1781 log_warnx("%s: failed to deassert irq %d", __func__, dev->irq); 1782 } 1783 1784 /* 1785 * Close all underlying file descriptors for a given virtio device. 
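 * Used by the parent once the device struct has been handed to the child
 * (which owns the data fds from then on) and by the child to drop the fds
 * belonging to its sibling devices before exec.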
1786 */ 1787 static int 1788 virtio_dev_closefds(struct virtio_dev *dev) 1789 { 1790 size_t i; 1791 1792 switch (dev->dev_type) { 1793 case VMD_DEVTYPE_DISK: 1794 for (i = 0; i < dev->vioblk.ndisk_fd; i++) { 1795 close_fd(dev->vioblk.disk_fd[i]); 1796 dev->vioblk.disk_fd[i] = -1; 1797 } 1798 break; 1799 case VMD_DEVTYPE_NET: 1800 close_fd(dev->vionet.data_fd); 1801 dev->vionet.data_fd = -1; 1802 break; 1803 default: 1804 log_warnx("%s: invalid device type", __func__); 1805 return (-1); 1806 } 1807 1808 close_fd(dev->async_fd); 1809 dev->async_fd = -1; 1810 close_fd(dev->sync_fd); 1811 dev->sync_fd = -1; 1812 1813 return (0); 1814 } 1815