/*	$OpenBSD: virtio.c,v 1.103 2023/05/13 23:15:28 dv Exp $	*/

/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>	/* PAGE_SIZE */
#include <sys/socket.h>
#include <sys/wait.h>

#include <machine/vmmvar.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcidevs.h>
#include <dev/pv/virtioreg.h>
#include <dev/pci/virtio_pcireg.h>
#include <dev/pv/vioblkreg.h>
#include <dev/pv/vioscsireg.h>

#include <net/if.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <poll.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "atomicio.h"
#include "pci.h"
#include "vioscsi.h"
#include "virtio.h"
#include "vmd.h"
#include "vmm.h"

extern struct vmd *env;
extern char *__progname;

struct viornd_dev viornd;
struct vioscsi_dev *vioscsi;
struct vmmci_dev vmmci;

/* Devices emulated in subprocesses are inserted into this list. */
SLIST_HEAD(virtio_dev_head, virtio_dev) virtio_devs;

#define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */

#define VIRTIO_NET_F_MAC	(1<<5)

#define VMMCI_F_TIMESYNC	(1<<0)
#define VMMCI_F_ACK		(1<<1)
#define VMMCI_F_SYNCRTC		(1<<2)

#define RXQ	0
#define TXQ	1

static int virtio_dev_launch(struct vmd_vm *, struct virtio_dev *);
static void virtio_dispatch_dev(int, short, void *);
static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *);

const char *
virtio_reg_name(uint8_t reg)
{
	switch (reg) {
	case VIRTIO_CONFIG_DEVICE_FEATURES: return "device feature";
	case VIRTIO_CONFIG_GUEST_FEATURES: return "guest feature";
	case VIRTIO_CONFIG_QUEUE_PFN: return "queue address";
	case VIRTIO_CONFIG_QUEUE_SIZE: return "queue size";
	case VIRTIO_CONFIG_QUEUE_SELECT: return "queue select";
	case VIRTIO_CONFIG_QUEUE_NOTIFY: return "queue notify";
	case VIRTIO_CONFIG_DEVICE_STATUS: return "device status";
	case VIRTIO_CONFIG_ISR_STATUS: return "isr status";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI ...
	    VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
		return "device config 0";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
		return "device config 1";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: return "device config 2";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: return "device config 3";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: return "device config 4";
	default: return "unknown";
	}
}

uint32_t
vring_size(uint32_t vq_size)
{
	uint32_t allocsize1, allocsize2;

	/* allocsize1: descriptor table + avail ring + pad */
	allocsize1 = VIRTQUEUE_ALIGN(sizeof(struct vring_desc) * vq_size
	    + sizeof(uint16_t) * (2 + vq_size));
	/* allocsize2: used ring + pad */
	allocsize2 = VIRTQUEUE_ALIGN(sizeof(uint16_t) * 2
	    + sizeof(struct vring_used_elem) * vq_size);

	return allocsize1 + allocsize2;
}

/* Update queue select */
void
viornd_update_qs(void)
{
	struct virtio_vq_info *vq_info;

	/* Invalid queue? */
	if (viornd.cfg.queue_select > 0) {
		viornd.cfg.queue_size = 0;
		return;
	}

	vq_info = &viornd.vq[viornd.cfg.queue_select];

	/* Update queue pfn/size based on queue select */
	viornd.cfg.queue_pfn = vq_info->q_gpa >> 12;
	viornd.cfg.queue_size = vq_info->qs;
}

/* Update queue address */
void
viornd_update_qa(void)
{
	struct virtio_vq_info *vq_info;
	void *hva = NULL;

	/* Invalid queue? */
	if (viornd.cfg.queue_select > 0)
		return;

	vq_info = &viornd.vq[viornd.cfg.queue_select];
	vq_info->q_gpa = (uint64_t)viornd.cfg.queue_pfn * VIRTIO_PAGE_SIZE;

	hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIORND_QUEUE_SIZE));
	if (hva == NULL)
		fatalx("viornd_update_qa");
	vq_info->q_hva = hva;
}

int
viornd_notifyq(void)
{
	size_t sz;
	int dxx, ret;
	uint16_t aidx, uidx;
	char *vr, *rnd_data;
	struct vring_desc *desc;
	struct vring_avail *avail;
	struct vring_used *used;
	struct virtio_vq_info *vq_info;

	ret = 0;

	/* Invalid queue? */
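	/* (viornd implements a single virtqueue, so only index 0 is valid.) */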
	if (viornd.cfg.queue_notify > 0)
		return (0);

	vq_info = &viornd.vq[viornd.cfg.queue_notify];
	vr = vq_info->q_hva;
	if (vr == NULL)
		fatalx("%s: null vring", __func__);

	desc = (struct vring_desc *)(vr);
	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);

	aidx = avail->idx & VIORND_QUEUE_MASK;
	uidx = used->idx & VIORND_QUEUE_MASK;

	dxx = avail->ring[aidx] & VIORND_QUEUE_MASK;

	sz = desc[dxx].len;
	if (sz > MAXPHYS)
		fatalx("viornd descriptor size too large (%zu)", sz);

	rnd_data = malloc(sz);

	if (rnd_data != NULL) {
		arc4random_buf(rnd_data, sz);
		if (write_mem(desc[dxx].addr, rnd_data, sz)) {
			log_warnx("viornd: can't write random data @ "
			    "0x%llx",
			    desc[dxx].addr);
		} else {
			/* ret == 1 -> interrupt needed */
			/* XXX check VIRTIO_F_NO_INTR */
			ret = 1;
			viornd.cfg.isr_status = 1;
			used->ring[uidx].id = dxx;
			used->ring[uidx].len = sz;
			__sync_synchronize();
			used->idx++;
		}
		free(rnd_data);
	} else
		fatal("memory allocation error for viornd data");

	return (ret);
}

int
virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
    void *unused, uint8_t sz)
{
	*intr = 0xFF;

	if (dir == 0) {
		switch (reg) {
		case VIRTIO_CONFIG_DEVICE_FEATURES:
		case VIRTIO_CONFIG_QUEUE_SIZE:
		case VIRTIO_CONFIG_ISR_STATUS:
			log_warnx("%s: illegal write %x to %s",
			    __progname, *data, virtio_reg_name(reg));
			break;
		case VIRTIO_CONFIG_GUEST_FEATURES:
			viornd.cfg.guest_feature = *data;
			break;
		case VIRTIO_CONFIG_QUEUE_PFN:
			viornd.cfg.queue_pfn = *data;
			viornd_update_qa();
			break;
		case VIRTIO_CONFIG_QUEUE_SELECT:
			viornd.cfg.queue_select = *data;
			viornd_update_qs();
			break;
		case VIRTIO_CONFIG_QUEUE_NOTIFY:
			viornd.cfg.queue_notify = *data;
			if (viornd_notifyq())
				*intr = 1;
			break;
		case VIRTIO_CONFIG_DEVICE_STATUS:
			viornd.cfg.device_status = *data;
			break;
		}
	} else {
		switch (reg) {
		case VIRTIO_CONFIG_DEVICE_FEATURES:
			*data = viornd.cfg.device_feature;
			break;
		case VIRTIO_CONFIG_GUEST_FEATURES:
			*data = viornd.cfg.guest_feature;
			break;
		case VIRTIO_CONFIG_QUEUE_PFN:
			*data = viornd.cfg.queue_pfn;
			break;
		case VIRTIO_CONFIG_QUEUE_SIZE:
			*data = viornd.cfg.queue_size;
			break;
		case VIRTIO_CONFIG_QUEUE_SELECT:
			*data = viornd.cfg.queue_select;
			break;
		case VIRTIO_CONFIG_QUEUE_NOTIFY:
			*data = viornd.cfg.queue_notify;
			break;
		case VIRTIO_CONFIG_DEVICE_STATUS:
			*data = viornd.cfg.device_status;
			break;
		case VIRTIO_CONFIG_ISR_STATUS:
			*data = viornd.cfg.isr_status;
			viornd.cfg.isr_status = 0;
			vcpu_deassert_pic_irq(viornd.vm_id, 0, viornd.irq);
			break;
		}
	}
	return (0);
}

int
vmmci_ctl(unsigned int cmd)
{
	struct timeval tv = { 0, 0 };

	if ((vmmci.cfg.device_status &
	    VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0)
		return (-1);

	if (cmd == vmmci.cmd)
		return (0);

	switch (cmd) {
	case VMMCI_NONE:
		break;
	case VMMCI_SHUTDOWN:
	case VMMCI_REBOOT:
		/* Update command */
		vmmci.cmd = cmd;

		/*
		 * vmm VMs do not support powerdown, send a reboot request
		 * instead and turn it off after the triple fault.
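		 * (vmmci.cmd keeps the original VMMCI_SHUTDOWN value, so a
		 * later vmmci_timeout() call hands VMMCI_SHUTDOWN, not
		 * VMMCI_REBOOT, to vm_shutdown().)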
		 */
		if (cmd == VMMCI_SHUTDOWN)
			cmd = VMMCI_REBOOT;

		/* Trigger interrupt */
		vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
		vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);

		/* Add ACK timeout */
		tv.tv_sec = VMMCI_TIMEOUT;
		evtimer_add(&vmmci.timeout, &tv);
		break;
	case VMMCI_SYNCRTC:
		if (vmmci.cfg.guest_feature & VMMCI_F_SYNCRTC) {
			/* RTC updated, request guest VM resync of its RTC */
			vmmci.cmd = cmd;

			vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
			vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
		} else {
			log_debug("%s: RTC sync skipped (guest does not "
			    "support RTC sync)\n", __func__);
		}
		break;
	default:
		fatalx("invalid vmmci command: %d", cmd);
	}

	return (0);
}

void
vmmci_ack(unsigned int cmd)
{
	struct timeval tv = { 0, 0 };

	switch (cmd) {
	case VMMCI_NONE:
		break;
	case VMMCI_SHUTDOWN:
		/*
		 * The shutdown was requested by the VM if we don't have
		 * a pending shutdown request. In this case add a short
		 * timeout to give the VM a chance to reboot before the
		 * timer is expired.
		 */
		if (vmmci.cmd == 0) {
			log_debug("%s: vm %u requested shutdown", __func__,
			    vmmci.vm_id);
			tv.tv_sec = VMMCI_TIMEOUT;
			evtimer_add(&vmmci.timeout, &tv);
			return;
		}
		/* FALLTHROUGH */
	case VMMCI_REBOOT:
		/*
		 * If the VM acknowledged our shutdown request, give it
		 * enough time to shutdown or reboot gracefully. This
		 * might take a considerable amount of time (running
		 * rc.shutdown on the VM), so increase the timeout before
		 * killing it forcefully.
		 */
		if (cmd == vmmci.cmd &&
		    evtimer_pending(&vmmci.timeout, NULL)) {
			log_debug("%s: vm %u acknowledged shutdown request",
			    __func__, vmmci.vm_id);
			tv.tv_sec = VMMCI_SHUTDOWN_TIMEOUT;
			evtimer_add(&vmmci.timeout, &tv);
		}
		break;
	case VMMCI_SYNCRTC:
		log_debug("%s: vm %u acknowledged RTC sync request",
		    __func__, vmmci.vm_id);
		vmmci.cmd = VMMCI_NONE;
		break;
	default:
		log_warnx("%s: illegal request %u", __func__, cmd);
		break;
	}
}

void
vmmci_timeout(int fd, short type, void *arg)
{
	log_debug("%s: vm %u shutdown", __progname, vmmci.vm_id);
	vm_shutdown(vmmci.cmd == VMMCI_REBOOT ? VMMCI_REBOOT : VMMCI_SHUTDOWN);
}

int
vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
    void *unused, uint8_t sz)
{
	*intr = 0xFF;

	if (dir == 0) {
		switch (reg) {
		case VIRTIO_CONFIG_DEVICE_FEATURES:
		case VIRTIO_CONFIG_QUEUE_SIZE:
		case VIRTIO_CONFIG_ISR_STATUS:
			log_warnx("%s: illegal write %x to %s",
			    __progname, *data, virtio_reg_name(reg));
			break;
		case VIRTIO_CONFIG_GUEST_FEATURES:
			vmmci.cfg.guest_feature = *data;
			break;
		case VIRTIO_CONFIG_QUEUE_PFN:
			vmmci.cfg.queue_pfn = *data;
			break;
		case VIRTIO_CONFIG_QUEUE_SELECT:
			vmmci.cfg.queue_select = *data;
			break;
		case VIRTIO_CONFIG_QUEUE_NOTIFY:
			vmmci.cfg.queue_notify = *data;
			break;
		case VIRTIO_CONFIG_DEVICE_STATUS:
			vmmci.cfg.device_status = *data;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
			vmmci_ack(*data);
			break;
		}
	} else {
		switch (reg) {
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
			*data = vmmci.cmd;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
			/* Update time once when reading the first register */
			gettimeofday(&vmmci.time, NULL);
			*data = (uint64_t)vmmci.time.tv_sec;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
			*data = (uint64_t)vmmci.time.tv_sec << 32;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
			*data = (uint64_t)vmmci.time.tv_usec;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16:
			*data = (uint64_t)vmmci.time.tv_usec << 32;
			break;
		case VIRTIO_CONFIG_DEVICE_FEATURES:
			*data = vmmci.cfg.device_feature;
			break;
		case VIRTIO_CONFIG_GUEST_FEATURES:
			*data = vmmci.cfg.guest_feature;
			break;
		case VIRTIO_CONFIG_QUEUE_PFN:
			*data = vmmci.cfg.queue_pfn;
			break;
		case VIRTIO_CONFIG_QUEUE_SIZE:
			*data = vmmci.cfg.queue_size;
			break;
		case VIRTIO_CONFIG_QUEUE_SELECT:
			*data = vmmci.cfg.queue_select;
			break;
		case VIRTIO_CONFIG_QUEUE_NOTIFY:
			*data = vmmci.cfg.queue_notify;
			break;
		case VIRTIO_CONFIG_DEVICE_STATUS:
			*data = vmmci.cfg.device_status;
			break;
		case VIRTIO_CONFIG_ISR_STATUS:
			*data = vmmci.cfg.isr_status;
			vmmci.cfg.isr_status = 0;
			vcpu_deassert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
			break;
		}
	}
	return (0);
}

int
virtio_get_base(int fd, char *path, size_t npath, int type, const char *dpath)
{
	switch (type) {
	case VMDF_RAW:
		return 0;
	case VMDF_QCOW2:
		return virtio_qcow2_get_base(fd, path, npath, dpath);
	}
	log_warnx("%s: invalid disk format", __func__);
	return -1;
}

void
virtio_init(struct vmd_vm *vm, int child_cdrom,
    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct virtio_dev *dev;
	uint8_t id;
	uint8_t i, j;

	/* Virtio entropy device */
	if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
	    PCI_PRODUCT_QUMRANET_VIO_RNG, PCI_CLASS_SYSTEM,
	    PCI_SUBCLASS_SYSTEM_MISC,
	    PCI_VENDOR_OPENBSD,
	    PCI_PRODUCT_VIRTIO_ENTROPY, 1, NULL)) {
		log_warnx("%s: can't add PCI virtio rng device",
		    __progname);
		return;
	}

	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_rnd_io, NULL)) {
		log_warnx("%s: can't add bar for virtio rng device",
		    __progname);
		return;
	}

	memset(&viornd, 0, sizeof(viornd));
	viornd.vq[0].qs = VIORND_QUEUE_SIZE;
	viornd.vq[0].vq_availoffset = sizeof(struct vring_desc) *
	    VIORND_QUEUE_SIZE;
	viornd.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
	    sizeof(struct vring_desc) * VIORND_QUEUE_SIZE
	    + sizeof(uint16_t) * (2 + VIORND_QUEUE_SIZE));
	viornd.pci_id = id;
	viornd.irq = pci_get_dev_irq(id);
	viornd.vm_id = vcp->vcp_id;

	SLIST_INIT(&virtio_devs);

	if (vmc->vmc_nnics > 0) {
		for (i = 0; i < vmc->vmc_nnics; i++) {
			dev = calloc(1, sizeof(struct virtio_dev));
			if (dev == NULL) {
				log_warn("%s: calloc failure allocating vionet",
				    __progname);
				return;
			}
			/* Virtio network */
			dev->dev_type = VMD_DEVTYPE_NET;

			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
			    PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM,
			    PCI_SUBCLASS_SYSTEM_MISC, PCI_VENDOR_OPENBSD,
			    PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) {
				log_warnx("%s: can't add PCI virtio net device",
				    __progname);
				return;
			}
			dev->pci_id = id;
			dev->sync_fd = -1;
			dev->async_fd = -1;
			dev->vm_id = vcp->vcp_id;
			dev->vm_vmid = vm->vm_vmid;
			dev->irq = pci_get_dev_irq(id);

			/* The vionet pci bar function is called by the vcpu. */
			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
			    dev)) {
				log_warnx("%s: can't add bar for virtio net "
				    "device", __progname);
				return;
			}

			dev->vionet.vq[RXQ].qs = VIONET_QUEUE_SIZE;
			dev->vionet.vq[RXQ].vq_availoffset =
			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
			dev->vionet.vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
			    + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
			dev->vionet.vq[RXQ].last_avail = 0;
			dev->vionet.vq[RXQ].notified_avail = 0;

			dev->vionet.vq[TXQ].qs = VIONET_QUEUE_SIZE;
			dev->vionet.vq[TXQ].vq_availoffset =
			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
			dev->vionet.vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
			    + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
			dev->vionet.vq[TXQ].last_avail = 0;
			dev->vionet.vq[TXQ].notified_avail = 0;

			dev->vionet.data_fd = child_taps[i];

			/* MAC address has been assigned by the parent */
			memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
			dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;

			dev->vionet.lockedmac =
			    vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
			dev->vionet.local =
			    vmc->vmc_ifflags[i] & VMIFF_LOCAL ? 1 : 0;
			if (i == 0 && vmc->vmc_bootdevice & VMBOOTDEV_NET)
				dev->vionet.pxeboot = 1;

			log_debug("%s: vm \"%s\" vio%u lladdr %s%s%s%s",
			    __func__, vcp->vcp_name, i,
			    ether_ntoa((void *)dev->vionet.mac),
			    dev->vionet.lockedmac ? ", locked" : "",
			    dev->vionet.local ? ", local" : "",
			    dev->vionet.pxeboot ? ", pxeboot" : "");

			/* Add the vionet to our device list. */
			dev->vionet.idx = i;
			SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
		}
	}

	if (vmc->vmc_ndisks > 0) {
		for (i = 0; i < vmc->vmc_ndisks; i++) {
			dev = calloc(1, sizeof(struct virtio_dev));
			if (dev == NULL) {
				log_warn("%s: calloc failure allocating vioblk",
				    __progname);
				return;
			}

			/* One vioblk device for each disk defined in vcp */
			dev->dev_type = VMD_DEVTYPE_DISK;

			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
			    PCI_PRODUCT_QUMRANET_VIO_BLOCK,
			    PCI_CLASS_MASS_STORAGE,
			    PCI_SUBCLASS_MASS_STORAGE_SCSI,
			    PCI_VENDOR_OPENBSD,
			    PCI_PRODUCT_VIRTIO_BLOCK, 1, NULL)) {
				log_warnx("%s: can't add PCI virtio block "
				    "device", __progname);
				return;
			}
			dev->pci_id = id;
			dev->sync_fd = -1;
			dev->async_fd = -1;
			dev->vm_id = vcp->vcp_id;
			dev->vm_vmid = vm->vm_vmid;
			dev->irq = pci_get_dev_irq(id);

			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
			    &dev->vioblk)) {
				log_warnx("%s: can't add bar for virtio block "
				    "device", __progname);
				return;
			}
			dev->vioblk.vq[0].qs = VIOBLK_QUEUE_SIZE;
			dev->vioblk.vq[0].vq_availoffset =
			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE;
			dev->vioblk.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE
			    + sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE));
			dev->vioblk.vq[0].last_avail = 0;
			dev->vioblk.cfg.device_feature =
			    VIRTIO_BLK_F_SIZE_MAX;
			dev->vioblk.max_xfer = 1048576;

			/*
			 * Initialize disk fds to an invalid fd (-1), then
			 * set any child disk fds.
			 */
			memset(&dev->vioblk.disk_fd, -1,
			    sizeof(dev->vioblk.disk_fd));
			dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
			for (j = 0; j < dev->vioblk.ndisk_fd; j++)
				dev->vioblk.disk_fd[j] = child_disks[i][j];

			dev->vioblk.idx = i;
			SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
		}
	}

	/*
	 * Launch virtio devices that support subprocess execution.
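	 * (Only vionet and vioblk devices are placed on the virtio_devs
	 * list, so only those are forked into their own processes; viornd,
	 * vioscsi and vmmci remain in the vm process.)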
	 */
	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (virtio_dev_launch(vm, dev) != 0)
			fatalx("failed to launch virtio device");
	}

	/* vioscsi cdrom */
	if (strlen(vmc->vmc_cdrom)) {
		vioscsi = calloc(1, sizeof(struct vioscsi_dev));
		if (vioscsi == NULL) {
			log_warn("%s: calloc failure allocating vioscsi",
			    __progname);
			return;
		}

		if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
		    PCI_PRODUCT_QUMRANET_VIO_SCSI,
		    PCI_CLASS_MASS_STORAGE,
		    PCI_SUBCLASS_MASS_STORAGE_SCSI,
		    PCI_VENDOR_OPENBSD,
		    PCI_PRODUCT_VIRTIO_SCSI, 1, NULL)) {
			log_warnx("%s: can't add PCI vioscsi device",
			    __progname);
			return;
		}

		if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vioscsi_io, vioscsi)) {
			log_warnx("%s: can't add bar for vioscsi device",
			    __progname);
			return;
		}

		for (i = 0; i < VIRTIO_MAX_QUEUES; i++) {
			vioscsi->vq[i].qs = VIOSCSI_QUEUE_SIZE;
			vioscsi->vq[i].vq_availoffset =
			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE;
			vioscsi->vq[i].vq_usedoffset = VIRTQUEUE_ALIGN(
			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE
			    + sizeof(uint16_t) * (2 + VIOSCSI_QUEUE_SIZE));
			vioscsi->vq[i].last_avail = 0;
		}
		if (virtio_raw_init(&vioscsi->file, &vioscsi->sz, &child_cdrom,
		    1) == -1) {
			log_warnx("%s: unable to determine iso format",
			    __func__);
			return;
		}
		vioscsi->locked = 0;
		vioscsi->lba = 0;
		vioscsi->n_blocks = vioscsi->sz >> 2; /* num of 2048 blocks in file */
		vioscsi->max_xfer = VIOSCSI_BLOCK_SIZE_CDROM;
		vioscsi->pci_id = id;
		vioscsi->vm_id = vcp->vcp_id;
		vioscsi->irq = pci_get_dev_irq(id);
	}

	/* virtio control device */
	if (pci_add_device(&id, PCI_VENDOR_OPENBSD,
	    PCI_PRODUCT_OPENBSD_CONTROL,
	    PCI_CLASS_COMMUNICATIONS,
	    PCI_SUBCLASS_COMMUNICATIONS_MISC,
	    PCI_VENDOR_OPENBSD,
	    PCI_PRODUCT_VIRTIO_VMMCI, 1, NULL)) {
		log_warnx("%s: can't add PCI vmm control device",
		    __progname);
		return;
	}

	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vmmci_io, NULL)) {
		log_warnx("%s: can't add bar for vmm control device",
		    __progname);
		return;
	}

	memset(&vmmci, 0, sizeof(vmmci));
	vmmci.cfg.device_feature = VMMCI_F_TIMESYNC | VMMCI_F_ACK |
	    VMMCI_F_SYNCRTC;
	vmmci.vm_id = vcp->vcp_id;
	vmmci.irq = pci_get_dev_irq(id);
	vmmci.pci_id = id;

	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
}

/*
 * vionet_set_hostmac
 *
 * Sets the hardware address for the host-side tap(4) on a vionet_dev.
 *
 * This should only be called from the event-loop thread
 *
 * vm: pointer to the current vmd_vm instance
 * idx: index into the array of vionet_dev's for the target vionet_dev
 * addr: ethernet address to set
 */
void
vionet_set_hostmac(struct vmd_vm *vm, unsigned int idx, uint8_t *addr)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct virtio_dev *dev;
	struct vionet_dev *vionet = NULL;
	int ret;

	if (idx > vmc->vmc_nnics)
		fatalx("%s: invalid vionet index: %u", __func__, idx);

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (dev->dev_type == VMD_DEVTYPE_NET
		    && dev->vionet.idx == idx) {
			vionet = &dev->vionet;
			break;
		}
	}
	if (vionet == NULL)
		fatalx("%s: dev == NULL, idx = %u", __func__, idx);

	/* Set the local vm process copy. */
	memcpy(vionet->hostmac, addr, sizeof(vionet->hostmac));

	/* Send the information to the device process. */
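	/* (Queued on the async channel; no reply is expected.) */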
	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_HOSTMAC, 0, 0, -1,
	    vionet->hostmac, sizeof(vionet->hostmac));
	if (ret == -1) {
		log_warnx("%s: failed to queue hostmac to vionet dev %u",
		    __func__, idx);
		return;
	}
}

void
virtio_shutdown(struct vmd_vm *vm)
{
	int ret, status;
	pid_t pid = 0;
	struct virtio_dev *dev, *tmp;
	struct viodev_msg msg;
	struct imsgbuf *ibuf;

	/* Ensure that our disks are synced. */
	if (vioscsi != NULL)
		vioscsi->file.close(vioscsi->file.p, 0);

	/*
	 * Broadcast shutdown to child devices. We need to do this
	 * synchronously as we have already stopped the async event thread.
	 */
	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		memset(&msg, 0, sizeof(msg));
		msg.type = VIODEV_MSG_SHUTDOWN;
		ibuf = &dev->sync_iev.ibuf;
		ret = imsg_compose(ibuf, VIODEV_MSG_SHUTDOWN, 0, 0, -1,
		    &msg, sizeof(msg));
		if (ret == -1)
			fatalx("%s: failed to send shutdown to device",
			    __func__);
		if (imsg_flush(ibuf) == -1)
			fatalx("%s: imsg_flush", __func__);
	}

	/*
	 * Wait for all children to shutdown using a simple approach of
	 * iterating over known child devices and waiting for them to die.
	 */
	SLIST_FOREACH_SAFE(dev, &virtio_devs, dev_next, tmp) {
		log_debug("%s: waiting on device pid %d", __func__,
		    dev->dev_pid);
		do {
			pid = waitpid(dev->dev_pid, &status, WNOHANG);
		} while (pid == 0 || (pid == -1 && errno == EINTR));
		if (pid == dev->dev_pid)
			log_debug("%s: device for pid %d is stopped",
			    __func__, pid);
		else
			log_warnx("%s: unexpected pid %d", __func__, pid);
		free(dev);
	}
}

int
vmmci_restore(int fd, uint32_t vm_id)
{
	log_debug("%s: receiving vmmci", __func__);
	if (atomicio(read, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
		log_warnx("%s: error reading vmmci from fd", __func__);
		return (-1);
	}

	if (pci_set_bar_fn(vmmci.pci_id, 0, vmmci_io, NULL)) {
		log_warnx("%s: can't set bar fn for vmm control device",
		    __progname);
		return (-1);
	}
	vmmci.vm_id = vm_id;
	vmmci.irq = pci_get_dev_irq(vmmci.pci_id);
	memset(&vmmci.timeout, 0, sizeof(struct event));
	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
	return (0);
}

int
viornd_restore(int fd, struct vmd_vm *vm)
{
	void *hva = NULL;

	log_debug("%s: receiving viornd", __func__);
	if (atomicio(read, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
		log_warnx("%s: error reading viornd from fd", __func__);
		return (-1);
	}
	if (pci_set_bar_fn(viornd.pci_id, 0, virtio_rnd_io, NULL)) {
		log_warnx("%s: can't set bar fn for virtio rng device",
		    __progname);
		return (-1);
	}
	viornd.vm_id = vm->vm_params.vmc_params.vcp_id;
	viornd.irq = pci_get_dev_irq(viornd.pci_id);

	hva = hvaddr_mem(viornd.vq[0].q_gpa, vring_size(VIORND_QUEUE_SIZE));
	if (hva == NULL)
		fatal("failed to restore viornd virtqueue");
	viornd.vq[0].q_hva = hva;

	return (0);
}

int
vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct virtio_dev *dev;
	uint8_t i;

	if (vmc->vmc_nnics == 0)
		return (0);

	for (i = 0; i < vmc->vmc_nnics; i++) {
		dev = calloc(1, sizeof(struct virtio_dev));
		if (dev == NULL) {
			log_warn("%s: calloc failure allocating vionet",
			    __progname);
			return (-1);
		}

		log_debug("%s: receiving virtio network device", __func__);
		if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
		    != sizeof(struct virtio_dev)) {
			log_warnx("%s: error reading vionet from fd",
			    __func__);
			return (-1);
		}

		/* Virtio network */
		if (dev->dev_type != VMD_DEVTYPE_NET) {
			log_warnx("%s: invalid device type", __func__);
			return (-1);
		}

		dev->sync_fd = -1;
		dev->async_fd = -1;
		dev->vm_id = vcp->vcp_id;
		dev->vm_vmid = vm->vm_vmid;
		dev->irq = pci_get_dev_irq(dev->pci_id);

		if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
			log_warnx("%s: can't set bar fn for virtio net "
			    "device", __progname);
			return (-1);
		}

		dev->vionet.data_fd = child_taps[i];
		dev->vionet.idx = i;

		SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
	}

	return (0);
}

int
vioblk_restore(int fd, struct vmd_vm *vm,
    int child_disks[][VM_MAX_BASE_PER_DISK])
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct virtio_dev *dev;
	uint8_t i, j;

	if (vmc->vmc_ndisks == 0)
		return (0);

	for (i = 0; i < vmc->vmc_ndisks; i++) {
		dev = calloc(1, sizeof(struct virtio_dev));
		if (dev == NULL) {
			log_warn("%s: calloc failure allocating vioblks",
			    __progname);
			return (-1);
		}

		log_debug("%s: receiving vioblk", __func__);
		if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
		    != sizeof(struct virtio_dev)) {
			log_warnx("%s: error reading vioblk from fd", __func__);
			return (-1);
		}
		if (dev->dev_type != VMD_DEVTYPE_DISK) {
			log_warnx("%s: invalid device type", __func__);
			return (-1);
		}

		dev->sync_fd = -1;
		dev->async_fd = -1;

		if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
			log_warnx("%s: can't set bar fn for virtio block "
			    "device", __progname);
			return (-1);
		}
		dev->vm_id = vmc->vmc_params.vcp_id;
		dev->irq = pci_get_dev_irq(dev->pci_id);

		memset(&dev->vioblk.disk_fd, -1, sizeof(dev->vioblk.disk_fd));
		dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
		for (j = 0; j < dev->vioblk.ndisk_fd; j++)
			dev->vioblk.disk_fd[j] = child_disks[i][j];

		dev->vioblk.idx = i;
		SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
	}
	return (0);
}

int
vioscsi_restore(int fd, struct vmd_vm *vm, int child_cdrom)
{
	void *hva = NULL;
	unsigned int i;

	if (!strlen(vm->vm_params.vmc_cdrom))
		return (0);

	vioscsi = calloc(1, sizeof(struct vioscsi_dev));
	if (vioscsi == NULL) {
		log_warn("%s: calloc failure allocating vioscsi", __progname);
		return (-1);
	}

	log_debug("%s: receiving vioscsi", __func__);

	if (atomicio(read, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
	    sizeof(struct vioscsi_dev)) {
		log_warnx("%s: error reading vioscsi from fd", __func__);
		return (-1);
	}

	if (pci_set_bar_fn(vioscsi->pci_id, 0, vioscsi_io, vioscsi)) {
		log_warnx("%s: can't set bar fn for vmm control device",
		    __progname);
		return (-1);
	}

	vioscsi->vm_id = vm->vm_params.vmc_params.vcp_id;
	vioscsi->irq = pci_get_dev_irq(vioscsi->pci_id);

	/* vioscsi uses 3 virtqueues. */
	for (i = 0; i < 3; i++) {
		hva = hvaddr_mem(vioscsi->vq[i].q_gpa,
		    vring_size(VIOSCSI_QUEUE_SIZE));
		if (hva == NULL)
			fatal("failed to restore vioscsi virtqueue");
		vioscsi->vq[i].q_hva = hva;
	}

	return (0);
}

int
virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
	struct virtio_dev *dev;
	int ret;

	SLIST_INIT(&virtio_devs);

	if ((ret = viornd_restore(fd, vm)) == -1)
		return (ret);

	if ((ret = vioblk_restore(fd, vm, child_disks)) == -1)
		return (ret);

	if ((ret = vioscsi_restore(fd, vm, child_cdrom)) == -1)
		return (ret);

	if ((ret = vionet_restore(fd, vm, child_taps)) == -1)
		return (ret);

	if ((ret = vmmci_restore(fd, vm->vm_params.vmc_params.vcp_id)) == -1)
		return (ret);

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (virtio_dev_launch(vm, dev) != 0)
			fatalx("%s: failed to restore virtio dev", __func__);
	}

	return (0);
}

int
viornd_dump(int fd)
{
	log_debug("%s: sending viornd", __func__);

	viornd.vq[0].q_hva = NULL;

	if (atomicio(vwrite, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
		log_warnx("%s: error writing viornd to fd", __func__);
		return (-1);
	}
	return (0);
}

int
vmmci_dump(int fd)
{
	log_debug("%s: sending vmmci", __func__);

	if (atomicio(vwrite, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
		log_warnx("%s: error writing vmmci to fd", __func__);
		return (-1);
	}
	return (0);
}

int
vionet_dump(int fd)
{
	struct virtio_dev *dev, temp;
	struct viodev_msg msg;
	struct imsg imsg;
	struct imsgbuf *ibuf = NULL;
	size_t sz;
	int ret;

	log_debug("%s: dumping vionet", __func__);

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (dev->dev_type != VMD_DEVTYPE_NET)
			continue;

		memset(&msg, 0, sizeof(msg));
		memset(&imsg, 0, sizeof(imsg));

		ibuf = &dev->sync_iev.ibuf;
		msg.type = VIODEV_MSG_DUMP;

		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
		    sizeof(msg));
		if (ret == -1) {
			log_warnx("%s: failed requesting dump of vionet[%d]",
			    __func__, dev->vionet.idx);
			return (-1);
		}
		if (imsg_flush(ibuf) == -1) {
			log_warnx("%s: imsg_flush", __func__);
			return (-1);
		}

		sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
		if (sz != sizeof(temp)) {
			log_warnx("%s: failed to dump vionet[%d]", __func__,
			    dev->vionet.idx);
			return (-1);
		}

		temp.vionet.vq[RXQ].q_hva = NULL;
		temp.vionet.vq[TXQ].q_hva = NULL;
		temp.async_fd = -1;
		temp.sync_fd = -1;
		memset(&temp.async_iev, 0, sizeof(temp.async_iev));
		memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));

		if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
			log_warnx("%s: error writing vionet to fd", __func__);
			return (-1);
		}
	}

	return (0);
}

int
vioblk_dump(int fd)
{
	struct virtio_dev *dev, temp;
	struct viodev_msg msg;
	struct imsg imsg;
	struct imsgbuf *ibuf = NULL;
	size_t sz;
	int ret;

	log_debug("%s: dumping vioblk", __func__);

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (dev->dev_type != VMD_DEVTYPE_DISK)
			continue;

		memset(&msg, 0, sizeof(msg));
		memset(&imsg, 0, sizeof(imsg));

		ibuf = &dev->sync_iev.ibuf;
		msg.type = VIODEV_MSG_DUMP;

		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
		    sizeof(msg));
		if (ret == -1) {
			log_warnx("%s: failed requesting dump of vioblk[%d]",
			    __func__, dev->vioblk.idx);
			return (-1);
		}
		if (imsg_flush(ibuf) == -1) {
			log_warnx("%s: imsg_flush", __func__);
			return (-1);
		}

		sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
		if (sz != sizeof(temp)) {
			log_warnx("%s: failed to dump vioblk[%d]", __func__,
			    dev->vioblk.idx);
			return (-1);
		}

		temp.vioblk.vq[0].q_hva = NULL;
		temp.async_fd = -1;
		temp.sync_fd = -1;
		memset(&temp.async_iev, 0, sizeof(temp.async_iev));
		memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));

		if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
			log_warnx("%s: error writing vioblk to fd", __func__);
			return (-1);
		}
	}

	return (0);
}

int
vioscsi_dump(int fd)
{
	unsigned int i;

	if (vioscsi == NULL)
		return (0);

	log_debug("%s: sending vioscsi", __func__);

	for (i = 0; i < 3; i++)
		vioscsi->vq[i].q_hva = NULL;

	if (atomicio(vwrite, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
	    sizeof(struct vioscsi_dev)) {
		log_warnx("%s: error writing vioscsi to fd", __func__);
		return (-1);
	}
	return (0);
}

int
virtio_dump(int fd)
{
	int ret;

	if ((ret = viornd_dump(fd)) == -1)
		return ret;

	if ((ret = vioblk_dump(fd)) == -1)
		return ret;

	if ((ret = vioscsi_dump(fd)) == -1)
		return ret;

	if ((ret = vionet_dump(fd)) == -1)
		return ret;

	if ((ret = vmmci_dump(fd)) == -1)
		return ret;

	return (0);
}

void
virtio_stop(struct vmd_vm *vm)
{
	struct virtio_dev *dev;
	int ret;

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		ret = imsg_compose_event(&dev->async_iev, IMSG_VMDOP_PAUSE_VM,
		    0, 0, -1, NULL, 0);
		if (ret == -1) {
			log_warnx("%s: failed to compose pause msg to device",
			    __func__);
		}
	}
}

void
virtio_start(struct vmd_vm *vm)
{
	struct virtio_dev *dev;
	int ret;

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		ret = imsg_compose_event(&dev->async_iev, IMSG_VMDOP_UNPAUSE_VM,
		    0, 0, -1, NULL, 0);
		if (ret == -1) {
			log_warnx("%s: failed to compose start msg to device",
			    __func__);
		}
	}
}

/*
 * Fork+exec a child virtio device. Returns 0 on success.
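 *
 * The parent writes the configured struct virtio_dev followed by the
 * struct vmd_vm over the sync channel, then waits for the child to reply
 * with a VIODEV_MSG_READY message before wiring the async channel into
 * the event loop.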
 */
static int
virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev)
{
	char *nargv[10], num[32], vmm_fd[32], t[2];
	pid_t dev_pid;
	int data_fds[VM_MAX_BASE_PER_DISK], sync_fds[2], async_fds[2], ret = 0;
	size_t i, j, data_fds_sz, sz = 0;
	struct virtio_dev *d = NULL;
	struct viodev_msg msg;
	struct imsg imsg;
	struct imsgev *iev = &dev->sync_iev;

	switch (dev->dev_type) {
	case VMD_DEVTYPE_NET:
		data_fds[0] = dev->vionet.data_fd;
		data_fds_sz = 1;
		log_debug("%s: launching vionet[%d]",
		    vm->vm_params.vmc_params.vcp_name, dev->vionet.idx);
		break;
	case VMD_DEVTYPE_DISK:
		memcpy(&data_fds, dev->vioblk.disk_fd, sizeof(data_fds));
		data_fds_sz = dev->vioblk.ndisk_fd;
		log_debug("%s: launching vioblk[%d]",
		    vm->vm_params.vmc_params.vcp_name, dev->vioblk.idx);
		break;
		/* NOTREACHED */
	default:
		log_warn("%s: invalid device type", __func__);
		return (EINVAL);
	}

	/* We need two channels: one synchronous (IO reads) and one async. */
	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, sync_fds) == -1) {
		log_warn("failed to create socketpair");
		return (errno);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, async_fds) == -1) {
		log_warn("failed to create async socketpair");
		return (errno);
	}

	/* Keep communication channels open after exec. */
	if (fcntl(sync_fds[1], F_SETFD, 0)) {
		ret = errno;
		log_warn("%s: fcntl", __func__);
		goto err;
	}
	if (fcntl(async_fds[1], F_SETFD, 0)) {
		ret = errno;
		log_warn("%s: fcntl", __func__);
		goto err;
	}

	/* Keep data file descriptors open after exec. */
	for (i = 0; i < data_fds_sz; i++) {
		log_debug("%s: marking fd %d !close-on-exec", __func__,
		    data_fds[i]);
		if (fcntl(data_fds[i], F_SETFD, 0)) {
			ret = errno;
			log_warn("%s: fcntl", __func__);
			goto err;
		}
	}

	/* Fork... */
	dev_pid = fork();
	if (dev_pid == -1) {
		ret = errno;
		log_warn("%s: fork failed", __func__);
		goto err;
	}

	if (dev_pid > 0) {
		/* Parent */
		close_fd(sync_fds[1]);
		close_fd(async_fds[1]);

		/* Save the child's pid to help with cleanup. */
		dev->dev_pid = dev_pid;

		/* Set the channel fds to the child's before sending. */
		dev->sync_fd = sync_fds[1];
		dev->async_fd = async_fds[1];

		/* Close data fds. Only the child device needs them now. */
		for (i = 0; i < data_fds_sz; i++)
			close_fd(data_fds[i]);

		/* Set our synchronous channel to non-blocking. */
		if (fcntl(sync_fds[0], F_SETFL, O_NONBLOCK) == -1) {
			ret = errno;
			log_warn("%s: fcntl", __func__);
			goto err;
		}

		/* 1. Send over our configured device. */
		log_debug("%s: sending '%c' type device struct", __func__,
		    dev->dev_type);
		sz = atomicio(vwrite, sync_fds[0], dev, sizeof(*dev));
		if (sz != sizeof(*dev)) {
			log_warnx("%s: failed to send device", __func__);
			ret = EIO;
			goto err;
		}

		/* 2. Send over details on the VM (including memory fds). */
		log_debug("%s: sending vm message for '%s'", __func__,
		    vm->vm_params.vmc_params.vcp_name);
		sz = atomicio(vwrite, sync_fds[0], vm, sizeof(*vm));
		if (sz != sizeof(*vm)) {
			log_warnx("%s: failed to send vm details", __func__);
			ret = EIO;
			goto err;
		}

		/*
		 * Initialize our imsg channel to the child device. The initial
		 * communication will be synchronous. We expect the child to
		 * report itself "ready" to confirm the launch was a success.
		 */
		imsg_init(&iev->ibuf, sync_fds[0]);
		do
			ret = imsg_read(&iev->ibuf);
		while (ret == -1 && errno == EAGAIN);
		if (ret == 0 || ret == -1) {
			log_warnx("%s: failed to receive ready message from "
			    "'%c' type device", __func__, dev->dev_type);
			ret = EIO;
			goto err;
		}
		ret = 0;

		log_debug("%s: receiving reply", __func__);
		if (imsg_get(&iev->ibuf, &imsg) < 1) {
			log_warnx("%s: imsg_get", __func__);
			ret = EIO;
			goto err;
		}
		IMSG_SIZE_CHECK(&imsg, &msg);
		memcpy(&msg, imsg.data, sizeof(msg));
		imsg_free(&imsg);

		if (msg.type != VIODEV_MSG_READY) {
			log_warnx("%s: expected ready message, got type %d",
			    __func__, msg.type);
			ret = EINVAL;
			goto err;
		}
		log_debug("%s: device reports ready via sync channel",
		    __func__);

		/*
		 * Wire in the async event handling, but after reverting back
		 * to the parent's fd's.
		 */
		dev->sync_fd = sync_fds[0];
		dev->async_fd = async_fds[0];
		vm_device_pipe(dev, virtio_dispatch_dev);
	} else {
		/* Child */
		close_fd(async_fds[0]);
		close_fd(sync_fds[0]);

		/*
		 * Close any other device fd's we know aren't
		 * ours. This releases any exclusive locks held on
		 * things like disk images.
		 */
		SLIST_FOREACH(d, &virtio_devs, dev_next) {
			if (d == dev)
				continue;

			switch (d->dev_type) {
			case VMD_DEVTYPE_DISK:
				for (j = 0; j < d->vioblk.ndisk_fd; j++)
					close_fd(d->vioblk.disk_fd[j]);
				break;
			case VMD_DEVTYPE_NET:
				close_fd(d->vionet.data_fd);
				break;
			default:
				fatalx("%s: invalid device type '%c'",
				    __func__, d->dev_type);
			}
		}

		memset(&nargv, 0, sizeof(nargv));
		memset(num, 0, sizeof(num));
		snprintf(num, sizeof(num), "%d", sync_fds[1]);
		memset(vmm_fd, 0, sizeof(vmm_fd));
		snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);

		t[0] = dev->dev_type;
		t[1] = '\0';

		nargv[0] = env->argv0;
		nargv[1] = "-X";
		nargv[2] = num;
		nargv[3] = "-t";
		nargv[4] = t;
		nargv[5] = "-i";
		nargv[6] = vmm_fd;
		nargv[7] = "-n";

		if (env->vmd_verbose) {
			nargv[8] = "-v";
			nargv[9] = NULL;
		} else
			nargv[8] = NULL;

		/* Control resumes in vmd.c:main(). */
		execvp(nargv[0], nargv);

		ret = errno;
		log_warn("%s: failed to exec device", __func__);
		_exit(ret);
		/* NOTREACHED */
	}

	return (ret);

err:
	close_fd(sync_fds[0]);
	close_fd(sync_fds[1]);
	close_fd(async_fds[0]);
	close_fd(async_fds[1]);
	return (ret);
}

/*
 * Initialize an async imsg channel for a virtio device.
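 *
 * The async channel carries the non-blocking traffic (interrupt kicks,
 * pause/unpause, hostmac updates); register I/O uses the sync channel set
 * up in virtio_dev_launch().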
 */
int
vm_device_pipe(struct virtio_dev *dev, void (*cb)(int, short, void *))
{
	struct imsgev *iev = &dev->async_iev;
	int fd = dev->async_fd;

	log_debug("%s: initializing '%c' device pipe (fd=%d)", __func__,
	    dev->dev_type, fd);

	if (fcntl(fd, F_SETFL, O_NONBLOCK) == -1) {
		log_warn("failed to set nonblocking mode on vm device pipe");
		return (-1);
	}

	imsg_init(&iev->ibuf, fd);
	iev->handler = cb;
	iev->data = dev;
	iev->events = EV_READ;
	imsg_event_add(iev);

	return (0);
}

void
virtio_dispatch_dev(int fd, short event, void *arg)
{
	struct virtio_dev *dev = (struct virtio_dev *)arg;
	struct imsgev *iev = &dev->async_iev;
	struct imsgbuf *ibuf = &iev->ibuf;
	struct imsg imsg;
	struct viodev_msg msg;
	ssize_t n = 0;

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0) {
			/* this pipe is dead, so remove the event handler */
			log_debug("%s: pipe dead (EV_READ)", __func__);
			event_del(&iev->ev);
			event_loopexit(NULL);
			return;
		}
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write", __func__);
		if (n == 0) {
			/* this pipe is dead, so remove the event handler */
			log_debug("%s: pipe dead (EV_WRITE)", __func__);
			event_del(&iev->ev);
			event_loopexit(NULL);
			return;
		}
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

		switch (imsg.hdr.type) {
		case IMSG_DEVOP_MSG:
			IMSG_SIZE_CHECK(&imsg, &msg);
			memcpy(&msg, imsg.data, sizeof(msg));
			handle_dev_msg(&msg, dev);
			break;
		default:
			log_warnx("%s: got non devop imsg %d", __func__,
			    imsg.hdr.type);
			break;
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}

static int
handle_dev_msg(struct viodev_msg *msg, struct virtio_dev *gdev)
{
	uint32_t vm_id = gdev->vm_id;
	int irq = gdev->irq;

	switch (msg->type) {
	case VIODEV_MSG_KICK:
		if (msg->state == INTR_STATE_ASSERT)
			vcpu_assert_pic_irq(vm_id, msg->vcpu, irq);
		else if (msg->state == INTR_STATE_DEASSERT)
			vcpu_deassert_pic_irq(vm_id, msg->vcpu, irq);
		break;
	case VIODEV_MSG_READY:
		log_debug("%s: device reports ready", __func__);
		break;
	case VIODEV_MSG_ERROR:
		log_warnx("%s: device reported error", __func__);
		break;
	case VIODEV_MSG_INVALID:
	case VIODEV_MSG_IO_READ:
	case VIODEV_MSG_IO_WRITE:
		/* FALLTHROUGH */
	default:
		log_warnx("%s: unsupported device message type %d", __func__,
		    msg->type);
		return (1);
	}

	return (0);
};

/*
 * Called by the VM process while processing IO from the VCPU thread.
 *
 * N.b. Since the VCPU thread calls this function, we cannot mutate the event
 * system. All ipc messages must be sent manually and cannot be queued for
 * the event loop to push them. (We need to perform a synchronous read, so
 * this isn't really a big deal.)
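 *
 * Register writes are fire-and-forget; register reads block on the sync
 * channel until the device process replies with a VIODEV_MSG_IO_READ
 * message carrying the value (and possibly an interrupt state change).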
 */
int
virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
    void *cookie, uint8_t sz)
{
	struct virtio_dev *dev = (struct virtio_dev *)cookie;
	struct imsgbuf *ibuf = &dev->sync_iev.ibuf;
	struct imsg imsg;
	struct viodev_msg msg;
	ssize_t n;
	int ret = 0;

	memset(&msg, 0, sizeof(msg));
	msg.reg = reg;
	msg.io_sz = sz;

	if (dir == 0) {
		msg.type = VIODEV_MSG_IO_WRITE;
		msg.data = *data;
		msg.data_valid = 1;
	} else
		msg.type = VIODEV_MSG_IO_READ;

	if (msg.type == VIODEV_MSG_IO_WRITE) {
		/*
		 * Write request. No reply expected.
		 */
		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
		    sizeof(msg));
		if (ret == -1) {
			log_warn("%s: failed to send async io event to vionet"
			    " device", __func__);
			return (ret);
		}
		if (imsg_flush(ibuf) == -1) {
			log_warnx("%s: imsg_flush (write)", __func__);
			return (-1);
		}
	} else {
		/*
		 * Read request. Requires waiting for a reply.
		 */
		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
		    sizeof(msg));
		if (ret == -1) {
			log_warnx("%s: failed to send sync io event to vionet"
			    " device", __func__);
			return (ret);
		}
		if (imsg_flush(ibuf) == -1) {
			log_warnx("%s: imsg_flush (read)", __func__);
			return (-1);
		}

		/* Read our reply. */
		do
			n = imsg_read(ibuf);
		while (n == -1 && errno == EAGAIN);
		if (n == 0 || n == -1) {
			log_warn("%s: imsg_read (n=%ld)", __func__, n);
			return (-1);
		}
		if ((n = imsg_get(ibuf, &imsg)) == -1) {
			log_warn("%s: imsg_get (n=%ld)", __func__, n);
			return (-1);
		}
		if (n == 0) {
			log_warnx("%s: invalid imsg", __func__);
			return (-1);
		}

		IMSG_SIZE_CHECK(&imsg, &msg);
		memcpy(&msg, imsg.data, sizeof(msg));
		imsg_free(&imsg);

		if (msg.type == VIODEV_MSG_IO_READ && msg.data_valid) {
			log_debug("%s: got sync read response (reg=%s)",
			    __func__, virtio_reg_name(msg.reg));
			*data = msg.data;
			/*
			 * It's possible we're asked to {de,}assert after the
			 * device performs a register read.
			 */
			if (msg.state == INTR_STATE_ASSERT)
				vcpu_assert_pic_irq(dev->vm_id, msg.vcpu, msg.irq);
			else if (msg.state == INTR_STATE_DEASSERT)
				vcpu_deassert_pic_irq(dev->vm_id, msg.vcpu, msg.irq);
		} else {
			log_warnx("%s: expected IO_READ, got %d", __func__,
			    msg.type);
			return (-1);
		}
	}

	return (0);
}

void
virtio_assert_pic_irq(struct virtio_dev *dev, int vcpu)
{
	struct viodev_msg msg;
	int ret;

	memset(&msg, 0, sizeof(msg));
	msg.irq = dev->irq;
	msg.vcpu = vcpu;
	msg.type = VIODEV_MSG_KICK;
	msg.state = INTR_STATE_ASSERT;

	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
	    &msg, sizeof(msg));
	if (ret == -1)
		log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
}

void
virtio_deassert_pic_irq(struct virtio_dev *dev, int vcpu)
{
	struct viodev_msg msg;
	int ret;

	memset(&msg, 0, sizeof(msg));
	msg.irq = dev->irq;
	msg.vcpu = vcpu;
	msg.type = VIODEV_MSG_KICK;
	msg.state = INTR_STATE_DEASSERT;

	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
	    &msg, sizeof(msg));
	if (ret == -1)
		log_warnx("%s: failed to deassert irq %d", __func__, dev->irq);
}