xref: /openbsd-src/usr.sbin/vmd/virtio.c (revision fc405d53b73a2d73393cb97f684863d17b583e38)
1 /*	$OpenBSD: virtio.c,v 1.103 2023/05/13 23:15:28 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* PAGE_SIZE */
20 #include <sys/socket.h>
21 #include <sys/wait.h>
22 
23 #include <machine/vmmvar.h>
24 #include <dev/pci/pcireg.h>
25 #include <dev/pci/pcidevs.h>
26 #include <dev/pv/virtioreg.h>
27 #include <dev/pci/virtio_pcireg.h>
28 #include <dev/pv/vioblkreg.h>
29 #include <dev/pv/vioscsireg.h>
30 
31 #include <net/if.h>
32 #include <netinet/in.h>
33 #include <netinet/if_ether.h>
34 #include <netinet/ip.h>
35 
36 #include <errno.h>
37 #include <event.h>
38 #include <fcntl.h>
39 #include <poll.h>
40 #include <stddef.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <unistd.h>
44 
45 #include "atomicio.h"
46 #include "pci.h"
47 #include "vioscsi.h"
48 #include "virtio.h"
49 #include "vmd.h"
50 #include "vmm.h"
51 
52 extern struct vmd *env;
53 extern char *__progname;
54 
55 struct viornd_dev viornd;
56 struct vioscsi_dev *vioscsi;
57 struct vmmci_dev vmmci;
58 
59 /* Devices emulated in subprocesses are inserted into this list. */
60 SLIST_HEAD(virtio_dev_head, virtio_dev) virtio_devs;
61 
62 #define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */
63 
64 #define VIRTIO_NET_F_MAC	(1<<5)
65 
66 #define VMMCI_F_TIMESYNC	(1<<0)
67 #define VMMCI_F_ACK		(1<<1)
68 #define VMMCI_F_SYNCRTC		(1<<2)
69 
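/* vionet virtqueue indexes */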
70 #define RXQ	0
71 #define TXQ	1
72 
73 static int virtio_dev_launch(struct vmd_vm *, struct virtio_dev *);
74 static void virtio_dispatch_dev(int, short, void *);
75 static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *);
76 
77 const char *
78 virtio_reg_name(uint8_t reg)
79 {
80 	switch (reg) {
81 	case VIRTIO_CONFIG_DEVICE_FEATURES: return "device feature";
82 	case VIRTIO_CONFIG_GUEST_FEATURES: return "guest feature";
83 	case VIRTIO_CONFIG_QUEUE_PFN: return "queue address";
84 	case VIRTIO_CONFIG_QUEUE_SIZE: return "queue size";
85 	case VIRTIO_CONFIG_QUEUE_SELECT: return "queue select";
86 	case VIRTIO_CONFIG_QUEUE_NOTIFY: return "queue notify";
87 	case VIRTIO_CONFIG_DEVICE_STATUS: return "device status";
88 	case VIRTIO_CONFIG_ISR_STATUS: return "isr status";
89 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI...VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
90 		return "device config 0";
91 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
92 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
93 		return "device config 1";
94 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: return "device config 2";
95 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: return "device config 3";
96 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: return "device config 4";
97 	default: return "unknown";
98 	}
99 }
100 
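/*
 * Compute the total size in bytes of a legacy (split) virtqueue with
 * vq_size entries.  The descriptor table and available ring share the
 * first aligned region; the used ring occupies the second.
 */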
101 uint32_t
102 vring_size(uint32_t vq_size)
103 {
104 	uint32_t allocsize1, allocsize2;
105 
106 	/* allocsize1: descriptor table + avail ring + pad */
107 	allocsize1 = VIRTQUEUE_ALIGN(sizeof(struct vring_desc) * vq_size
108 	    + sizeof(uint16_t) * (2 + vq_size));
109 	/* allocsize2: used ring + pad */
110 	allocsize2 = VIRTQUEUE_ALIGN(sizeof(uint16_t) * 2
111 	    + sizeof(struct vring_used_elem) * vq_size);
112 
113 	return allocsize1 + allocsize2;
114 }
115 
116 /* Update queue select */
117 void
118 viornd_update_qs(void)
119 {
120 	struct virtio_vq_info *vq_info;
121 
122 	/* Invalid queue? */
123 	if (viornd.cfg.queue_select > 0) {
124 		viornd.cfg.queue_size = 0;
125 		return;
126 	}
127 
128 	vq_info = &viornd.vq[viornd.cfg.queue_select];
129 
130 	/* Update queue pfn/size based on queue select */
131 	viornd.cfg.queue_pfn = vq_info->q_gpa >> 12;
132 	viornd.cfg.queue_size = vq_info->qs;
133 }
134 
135 /* Update queue address */
136 void
137 viornd_update_qa(void)
138 {
139 	struct virtio_vq_info *vq_info;
140 	void *hva = NULL;
141 
142 	/* Invalid queue? */
143 	if (viornd.cfg.queue_select > 0)
144 		return;
145 
146 	vq_info = &viornd.vq[viornd.cfg.queue_select];
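	/* The legacy QUEUE_PFN register holds the ring's guest page frame number. */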
147 	vq_info->q_gpa = (uint64_t)viornd.cfg.queue_pfn * VIRTIO_PAGE_SIZE;
148 
149 	hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIORND_QUEUE_SIZE));
150 	if (hva == NULL)
151 		fatalx("viornd_update_qa");
152 	vq_info->q_hva = hva;
153 }
154 
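/*
 * Handle a queue notification for the entropy device: fill the next
 * available descriptor with random bytes and place it on the used ring.
 * Returns 1 if the caller should inject an interrupt.
 */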
155 int
156 viornd_notifyq(void)
157 {
158 	size_t sz;
159 	int dxx, ret;
160 	uint16_t aidx, uidx;
161 	char *vr, *rnd_data;
162 	struct vring_desc *desc;
163 	struct vring_avail *avail;
164 	struct vring_used *used;
165 	struct virtio_vq_info *vq_info;
166 
167 	ret = 0;
168 
169 	/* Invalid queue? */
170 	if (viornd.cfg.queue_notify > 0)
171 		return (0);
172 
173 	vq_info = &viornd.vq[viornd.cfg.queue_notify];
174 	vr = vq_info->q_hva;
175 	if (vr == NULL)
176 		fatalx("%s: null vring", __func__);
177 
178 	desc = (struct vring_desc *)(vr);
179 	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
180 	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
181 
182 	aidx = avail->idx & VIORND_QUEUE_MASK;
183 	uidx = used->idx & VIORND_QUEUE_MASK;
184 
185 	dxx = avail->ring[aidx] & VIORND_QUEUE_MASK;
186 
187 	sz = desc[dxx].len;
188 	if (sz > MAXPHYS)
189 		fatalx("viornd descriptor size too large (%zu)", sz);
190 
191 	rnd_data = malloc(sz);
192 
193 	if (rnd_data != NULL) {
194 		arc4random_buf(rnd_data, sz);
195 		if (write_mem(desc[dxx].addr, rnd_data, sz)) {
196 			log_warnx("viornd: can't write random data @ "
197 			    "0x%llx",
198 			    desc[dxx].addr);
199 		} else {
200 			/* ret == 1 -> interrupt needed */
201 			/* XXX check VIRTIO_F_NO_INTR */
202 			ret = 1;
203 			viornd.cfg.isr_status = 1;
204 			used->ring[uidx].id = dxx;
205 			used->ring[uidx].len = sz;
206 			__sync_synchronize();
207 			used->idx++;
208 		}
209 		free(rnd_data);
210 	} else
211 		fatal("memory allocation error for viornd data");
212 
213 	return (ret);
214 }
215 
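/*
 * I/O handler for the entropy device's PCI bar, emulating the legacy
 * virtio configuration registers.  dir == 0 is a guest write, anything
 * else a read.
 */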
216 int
217 virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
218     void *unused, uint8_t sz)
219 {
220 	*intr = 0xFF;
221 
222 	if (dir == 0) {
223 		switch (reg) {
224 		case VIRTIO_CONFIG_DEVICE_FEATURES:
225 		case VIRTIO_CONFIG_QUEUE_SIZE:
226 		case VIRTIO_CONFIG_ISR_STATUS:
227 			log_warnx("%s: illegal write %x to %s",
228 			    __progname, *data, virtio_reg_name(reg));
229 			break;
230 		case VIRTIO_CONFIG_GUEST_FEATURES:
231 			viornd.cfg.guest_feature = *data;
232 			break;
233 		case VIRTIO_CONFIG_QUEUE_PFN:
234 			viornd.cfg.queue_pfn = *data;
235 			viornd_update_qa();
236 			break;
237 		case VIRTIO_CONFIG_QUEUE_SELECT:
238 			viornd.cfg.queue_select = *data;
239 			viornd_update_qs();
240 			break;
241 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
242 			viornd.cfg.queue_notify = *data;
243 			if (viornd_notifyq())
244 				*intr = 1;
245 			break;
246 		case VIRTIO_CONFIG_DEVICE_STATUS:
247 			viornd.cfg.device_status = *data;
248 			break;
249 		}
250 	} else {
251 		switch (reg) {
252 		case VIRTIO_CONFIG_DEVICE_FEATURES:
253 			*data = viornd.cfg.device_feature;
254 			break;
255 		case VIRTIO_CONFIG_GUEST_FEATURES:
256 			*data = viornd.cfg.guest_feature;
257 			break;
258 		case VIRTIO_CONFIG_QUEUE_PFN:
259 			*data = viornd.cfg.queue_pfn;
260 			break;
261 		case VIRTIO_CONFIG_QUEUE_SIZE:
262 			*data = viornd.cfg.queue_size;
263 			break;
264 		case VIRTIO_CONFIG_QUEUE_SELECT:
265 			*data = viornd.cfg.queue_select;
266 			break;
267 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
268 			*data = viornd.cfg.queue_notify;
269 			break;
270 		case VIRTIO_CONFIG_DEVICE_STATUS:
271 			*data = viornd.cfg.device_status;
272 			break;
273 		case VIRTIO_CONFIG_ISR_STATUS:
274 			*data = viornd.cfg.isr_status;
275 			viornd.cfg.isr_status = 0;
276 			vcpu_deassert_pic_irq(viornd.vm_id, 0, viornd.irq);
277 			break;
278 		}
279 	}
280 	return (0);
281 }
282 
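/*
 * Send a command (shutdown, reboot, RTC sync) to the guest via the vmm
 * control interface: record the command, raise a config-change
 * interrupt and, for shutdown/reboot, arm the ACK timeout.
 */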
283 int
284 vmmci_ctl(unsigned int cmd)
285 {
286 	struct timeval tv = { 0, 0 };
287 
288 	if ((vmmci.cfg.device_status &
289 	    VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0)
290 		return (-1);
291 
292 	if (cmd == vmmci.cmd)
293 		return (0);
294 
295 	switch (cmd) {
296 	case VMMCI_NONE:
297 		break;
298 	case VMMCI_SHUTDOWN:
299 	case VMMCI_REBOOT:
300 		/* Update command */
301 		vmmci.cmd = cmd;
302 
303 		/*
304 		 * vmm VMs do not support powerdown, send a reboot request
305 		 * instead and turn it off after the triple fault.
306 		 */
307 		if (cmd == VMMCI_SHUTDOWN)
308 			cmd = VMMCI_REBOOT;
309 
310 		/* Trigger interrupt */
311 		vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
312 		vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
313 
314 		/* Add ACK timeout */
315 		tv.tv_sec = VMMCI_TIMEOUT;
316 		evtimer_add(&vmmci.timeout, &tv);
317 		break;
318 	case VMMCI_SYNCRTC:
319 		if (vmmci.cfg.guest_feature & VMMCI_F_SYNCRTC) {
320 			/* RTC updated, request guest VM resync of its RTC */
321 			vmmci.cmd = cmd;
322 
323 			vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
324 			vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
325 		} else {
326 			log_debug("%s: RTC sync skipped (guest does not "
327 			    "support RTC sync)", __func__);
328 		}
329 		break;
330 	default:
331 		fatalx("invalid vmmci command: %d", cmd);
332 	}
333 
334 	return (0);
335 }
336 
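/*
 * Handle the guest's write to the command register: either an
 * acknowledgement of a pending request or a guest-initiated shutdown.
 */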
337 void
338 vmmci_ack(unsigned int cmd)
339 {
340 	struct timeval	 tv = { 0, 0 };
341 
342 	switch (cmd) {
343 	case VMMCI_NONE:
344 		break;
345 	case VMMCI_SHUTDOWN:
346 		/*
347 		 * If we don't have a pending shutdown request, the shutdown
348 		 * was requested by the VM itself.  In that case, add a short
349 		 * timeout to give the VM a chance to reboot before the
350 		 * timer expires.
351 		 */
352 		if (vmmci.cmd == 0) {
353 			log_debug("%s: vm %u requested shutdown", __func__,
354 			    vmmci.vm_id);
355 			tv.tv_sec = VMMCI_TIMEOUT;
356 			evtimer_add(&vmmci.timeout, &tv);
357 			return;
358 		}
359 		/* FALLTHROUGH */
360 	case VMMCI_REBOOT:
361 		/*
362 		 * If the VM acknowledged our shutdown request, give it
363 		 * enough time to shutdown or reboot gracefully.  This
364 		 * might take a considerable amount of time (running
365 		 * rc.shutdown on the VM), so increase the timeout before
366 		 * killing it forcefully.
367 		 */
368 		if (cmd == vmmci.cmd &&
369 		    evtimer_pending(&vmmci.timeout, NULL)) {
370 			log_debug("%s: vm %u acknowledged shutdown request",
371 			    __func__, vmmci.vm_id);
372 			tv.tv_sec = VMMCI_SHUTDOWN_TIMEOUT;
373 			evtimer_add(&vmmci.timeout, &tv);
374 		}
375 		break;
376 	case VMMCI_SYNCRTC:
377 		log_debug("%s: vm %u acknowledged RTC sync request",
378 		    __func__, vmmci.vm_id);
379 		vmmci.cmd = VMMCI_NONE;
380 		break;
381 	default:
382 		log_warnx("%s: illegal request %u", __func__, cmd);
383 		break;
384 	}
385 }
386 
387 void
388 vmmci_timeout(int fd, short type, void *arg)
389 {
390 	log_debug("%s: vm %u shutdown", __progname, vmmci.vm_id);
391 	vm_shutdown(vmmci.cmd == VMMCI_REBOOT ? VMMCI_REBOOT : VMMCI_SHUTDOWN);
392 }
393 
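/*
 * I/O handler for the vmm control device.  Besides the standard virtio
 * registers, the device-specific config space exposes the current
 * command and the host's time as four 32-bit registers (seconds and
 * microseconds, low and high words).
 */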
394 int
395 vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
396     void *unused, uint8_t sz)
397 {
398 	*intr = 0xFF;
399 
400 	if (dir == 0) {
401 		switch (reg) {
402 		case VIRTIO_CONFIG_DEVICE_FEATURES:
403 		case VIRTIO_CONFIG_QUEUE_SIZE:
404 		case VIRTIO_CONFIG_ISR_STATUS:
405 			log_warnx("%s: illegal write %x to %s",
406 			    __progname, *data, virtio_reg_name(reg));
407 			break;
408 		case VIRTIO_CONFIG_GUEST_FEATURES:
409 			vmmci.cfg.guest_feature = *data;
410 			break;
411 		case VIRTIO_CONFIG_QUEUE_PFN:
412 			vmmci.cfg.queue_pfn = *data;
413 			break;
414 		case VIRTIO_CONFIG_QUEUE_SELECT:
415 			vmmci.cfg.queue_select = *data;
416 			break;
417 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
418 			vmmci.cfg.queue_notify = *data;
419 			break;
420 		case VIRTIO_CONFIG_DEVICE_STATUS:
421 			vmmci.cfg.device_status = *data;
422 			break;
423 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
424 			vmmci_ack(*data);
425 			break;
426 		}
427 	} else {
428 		switch (reg) {
429 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
430 			*data = vmmci.cmd;
431 			break;
432 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
433 			/* Update time once when reading the first register */
434 			gettimeofday(&vmmci.time, NULL);
435 			*data = (uint64_t)vmmci.time.tv_sec;
436 			break;
437 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
438 			*data = (uint64_t)vmmci.time.tv_sec >> 32;	/* high word */
439 			break;
440 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
441 			*data = (uint64_t)vmmci.time.tv_usec;
442 			break;
443 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16:
444 			*data = (uint64_t)vmmci.time.tv_usec >> 32;	/* high word */
445 			break;
446 		case VIRTIO_CONFIG_DEVICE_FEATURES:
447 			*data = vmmci.cfg.device_feature;
448 			break;
449 		case VIRTIO_CONFIG_GUEST_FEATURES:
450 			*data = vmmci.cfg.guest_feature;
451 			break;
452 		case VIRTIO_CONFIG_QUEUE_PFN:
453 			*data = vmmci.cfg.queue_pfn;
454 			break;
455 		case VIRTIO_CONFIG_QUEUE_SIZE:
456 			*data = vmmci.cfg.queue_size;
457 			break;
458 		case VIRTIO_CONFIG_QUEUE_SELECT:
459 			*data = vmmci.cfg.queue_select;
460 			break;
461 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
462 			*data = vmmci.cfg.queue_notify;
463 			break;
464 		case VIRTIO_CONFIG_DEVICE_STATUS:
465 			*data = vmmci.cfg.device_status;
466 			break;
467 		case VIRTIO_CONFIG_ISR_STATUS:
468 			*data = vmmci.cfg.isr_status;
469 			vmmci.cfg.isr_status = 0;
470 			vcpu_deassert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
471 			break;
472 		}
473 	}
474 	return (0);
475 }
476 
477 int
478 virtio_get_base(int fd, char *path, size_t npath, int type, const char *dpath)
479 {
480 	switch (type) {
481 	case VMDF_RAW:
482 		return 0;
483 	case VMDF_QCOW2:
484 		return virtio_qcow2_get_base(fd, path, npath, dpath);
485 	}
486 	log_warnx("%s: invalid disk format", __func__);
487 	return -1;
488 }
489 
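/*
 * Create and attach the PCI virtio devices for a new VM: the entropy
 * device, one vionet per NIC, one vioblk per disk, an optional vioscsi
 * cdrom and the vmm control device.  vionet and vioblk devices are
 * forked off as separate device processes via virtio_dev_launch().
 */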
490 void
491 virtio_init(struct vmd_vm *vm, int child_cdrom,
492     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
493 {
494 	struct vmop_create_params *vmc = &vm->vm_params;
495 	struct vm_create_params *vcp = &vmc->vmc_params;
496 	struct virtio_dev *dev;
497 	uint8_t id;
498 	uint8_t i, j;
499 
500 	/* Virtio entropy device */
501 	if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
502 	    PCI_PRODUCT_QUMRANET_VIO_RNG, PCI_CLASS_SYSTEM,
503 	    PCI_SUBCLASS_SYSTEM_MISC,
504 	    PCI_VENDOR_OPENBSD,
505 	    PCI_PRODUCT_VIRTIO_ENTROPY, 1, NULL)) {
506 		log_warnx("%s: can't add PCI virtio rng device",
507 		    __progname);
508 		return;
509 	}
510 
511 	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_rnd_io, NULL)) {
512 		log_warnx("%s: can't add bar for virtio rng device",
513 		    __progname);
514 		return;
515 	}
516 
517 	memset(&viornd, 0, sizeof(viornd));
518 	viornd.vq[0].qs = VIORND_QUEUE_SIZE;
519 	viornd.vq[0].vq_availoffset = sizeof(struct vring_desc) *
520 	    VIORND_QUEUE_SIZE;
521 	viornd.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
522 	    sizeof(struct vring_desc) * VIORND_QUEUE_SIZE
523 	    + sizeof(uint16_t) * (2 + VIORND_QUEUE_SIZE));
524 	viornd.pci_id = id;
525 	viornd.irq = pci_get_dev_irq(id);
526 	viornd.vm_id = vcp->vcp_id;
527 
528 	SLIST_INIT(&virtio_devs);
529 
530 	if (vmc->vmc_nnics > 0) {
531 		for (i = 0; i < vmc->vmc_nnics; i++) {
532 			dev = calloc(1, sizeof(struct virtio_dev));
533 			if (dev == NULL) {
534 				log_warn("%s: calloc failure allocating vionet",
535 				    __progname);
536 				return;
537 			}
538 			/* Virtio network */
539 			dev->dev_type = VMD_DEVTYPE_NET;
540 
541 			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
542 				PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM,
543 				PCI_SUBCLASS_SYSTEM_MISC, PCI_VENDOR_OPENBSD,
544 				PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) {
545 				log_warnx("%s: can't add PCI virtio net device",
546 				    __progname);
547 				return;
548 			}
549 			dev->pci_id = id;
550 			dev->sync_fd = -1;
551 			dev->async_fd = -1;
552 			dev->vm_id = vcp->vcp_id;
553 			dev->vm_vmid = vm->vm_vmid;
554 			dev->irq = pci_get_dev_irq(id);
555 
556 			/* The vionet pci bar function is called by the vcpu. */
557 			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
558 			    dev)) {
559 				log_warnx("%s: can't add bar for virtio net "
560 				    "device", __progname);
561 				return;
562 			}
563 
564 			dev->vionet.vq[RXQ].qs = VIONET_QUEUE_SIZE;
565 			dev->vionet.vq[RXQ].vq_availoffset =
566 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
567 			dev->vionet.vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
568 				sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
569 				+ sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
570 			dev->vionet.vq[RXQ].last_avail = 0;
571 			dev->vionet.vq[RXQ].notified_avail = 0;
572 
573 			dev->vionet.vq[TXQ].qs = VIONET_QUEUE_SIZE;
574 			dev->vionet.vq[TXQ].vq_availoffset =
575 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
576 			dev->vionet.vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
577 				sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
578 				+ sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
579 			dev->vionet.vq[TXQ].last_avail = 0;
580 			dev->vionet.vq[TXQ].notified_avail = 0;
581 
582 			dev->vionet.data_fd = child_taps[i];
583 
584 			/* MAC address has been assigned by the parent */
585 			memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
586 			dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;
587 
588 			dev->vionet.lockedmac =
589 			    vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
590 			dev->vionet.local =
591 			    vmc->vmc_ifflags[i] & VMIFF_LOCAL ? 1 : 0;
592 			if (i == 0 && vmc->vmc_bootdevice & VMBOOTDEV_NET)
593 				dev->vionet.pxeboot = 1;
594 
595 			log_debug("%s: vm \"%s\" vio%u lladdr %s%s%s%s",
596 			    __func__, vcp->vcp_name, i,
597 			    ether_ntoa((void *)dev->vionet.mac),
598 			    dev->vionet.lockedmac ? ", locked" : "",
599 			    dev->vionet.local ? ", local" : "",
600 			    dev->vionet.pxeboot ? ", pxeboot" : "");
601 
602 			/* Add the vionet to our device list. */
603 			dev->vionet.idx = i;
604 			SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
605 		}
606 	}
607 
608 	if (vmc->vmc_ndisks > 0) {
609 		for (i = 0; i < vmc->vmc_ndisks; i++) {
610 			dev = calloc(1, sizeof(struct virtio_dev));
611 			if (dev == NULL) {
612 				log_warn("%s: calloc failure allocating vioblk",
613 				    __progname);
614 				return;
615 			}
616 
617 			/* One vioblk device for each disk defined in vcp */
618 			dev->dev_type = VMD_DEVTYPE_DISK;
619 
620 			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
621 			    PCI_PRODUCT_QUMRANET_VIO_BLOCK,
622 			    PCI_CLASS_MASS_STORAGE,
623 			    PCI_SUBCLASS_MASS_STORAGE_SCSI,
624 			    PCI_VENDOR_OPENBSD,
625 			    PCI_PRODUCT_VIRTIO_BLOCK, 1, NULL)) {
626 				log_warnx("%s: can't add PCI virtio block "
627 				    "device", __progname);
628 				return;
629 			}
630 			dev->pci_id = id;
631 			dev->sync_fd = -1;
632 			dev->async_fd = -1;
633 			dev->vm_id = vcp->vcp_id;
634 			dev->vm_vmid = vm->vm_vmid;
635 			dev->irq = pci_get_dev_irq(id);
636 
637 			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
638 			    &dev->vioblk)) {
639 				log_warnx("%s: can't add bar for virtio block "
640 				    "device", __progname);
641 				return;
642 			}
643 			dev->vioblk.vq[0].qs = VIOBLK_QUEUE_SIZE;
644 			dev->vioblk.vq[0].vq_availoffset =
645 			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE;
646 			dev->vioblk.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
647 			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE
648 			    + sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE));
649 			dev->vioblk.vq[0].last_avail = 0;
650 			dev->vioblk.cfg.device_feature =
651 			    VIRTIO_BLK_F_SIZE_MAX;
652 			dev->vioblk.max_xfer = 1048576;
653 
654 			/*
655 			 * Initialize disk fds to an invalid fd (-1), then
656 			 * set any child disk fds.
657 			 */
658 			memset(&dev->vioblk.disk_fd, -1,
659 			    sizeof(dev->vioblk.disk_fd));
660 			dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
661 			for (j = 0; j < dev->vioblk.ndisk_fd; j++)
662 				dev->vioblk.disk_fd[j] = child_disks[i][j];
663 
664 			dev->vioblk.idx = i;
665 			SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
666 		}
667 	}
668 
669 	/*
670 	 * Launch virtio devices that support subprocess execution.
671 	 */
672 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
673 		if (virtio_dev_launch(vm, dev) != 0)
674 			fatalx("failed to launch virtio device");
675 	}
676 
677 	/* vioscsi cdrom */
678 	if (strlen(vmc->vmc_cdrom)) {
679 		vioscsi = calloc(1, sizeof(struct vioscsi_dev));
680 		if (vioscsi == NULL) {
681 			log_warn("%s: calloc failure allocating vioscsi",
682 			    __progname);
683 			return;
684 		}
685 
686 		if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
687 		    PCI_PRODUCT_QUMRANET_VIO_SCSI,
688 		    PCI_CLASS_MASS_STORAGE,
689 		    PCI_SUBCLASS_MASS_STORAGE_SCSI,
690 		    PCI_VENDOR_OPENBSD,
691 		    PCI_PRODUCT_VIRTIO_SCSI, 1, NULL)) {
692 			log_warnx("%s: can't add PCI vioscsi device",
693 			    __progname);
694 			return;
695 		}
696 
697 		if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vioscsi_io, vioscsi)) {
698 			log_warnx("%s: can't add bar for vioscsi device",
699 			    __progname);
700 			return;
701 		}
702 
703 		for (i = 0; i < VIRTIO_MAX_QUEUES; i++) {
704 			vioscsi->vq[i].qs = VIOSCSI_QUEUE_SIZE;
705 			vioscsi->vq[i].vq_availoffset =
706 			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE;
707 			vioscsi->vq[i].vq_usedoffset = VIRTQUEUE_ALIGN(
708 			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE
709 			    + sizeof(uint16_t) * (2 + VIOSCSI_QUEUE_SIZE));
710 			vioscsi->vq[i].last_avail = 0;
711 		}
712 		if (virtio_raw_init(&vioscsi->file, &vioscsi->sz, &child_cdrom,
713 		    1) == -1) {
714 			log_warnx("%s: unable to determine iso format",
715 			    __func__);
716 			return;
717 		}
718 		vioscsi->locked = 0;
719 		vioscsi->lba = 0;
720 		vioscsi->n_blocks = vioscsi->sz >> 11; /* number of 2048-byte blocks in file */
721 		vioscsi->max_xfer = VIOSCSI_BLOCK_SIZE_CDROM;
722 		vioscsi->pci_id = id;
723 		vioscsi->vm_id = vcp->vcp_id;
724 		vioscsi->irq = pci_get_dev_irq(id);
725 	}
726 
727 	/* virtio control device */
728 	if (pci_add_device(&id, PCI_VENDOR_OPENBSD,
729 	    PCI_PRODUCT_OPENBSD_CONTROL,
730 	    PCI_CLASS_COMMUNICATIONS,
731 	    PCI_SUBCLASS_COMMUNICATIONS_MISC,
732 	    PCI_VENDOR_OPENBSD,
733 	    PCI_PRODUCT_VIRTIO_VMMCI, 1, NULL)) {
734 		log_warnx("%s: can't add PCI vmm control device",
735 		    __progname);
736 		return;
737 	}
738 
739 	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vmmci_io, NULL)) {
740 		log_warnx("%s: can't add bar for vmm control device",
741 		    __progname);
742 		return;
743 	}
744 
745 	memset(&vmmci, 0, sizeof(vmmci));
746 	vmmci.cfg.device_feature = VMMCI_F_TIMESYNC | VMMCI_F_ACK |
747 	    VMMCI_F_SYNCRTC;
748 	vmmci.vm_id = vcp->vcp_id;
749 	vmmci.irq = pci_get_dev_irq(id);
750 	vmmci.pci_id = id;
751 
752 	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
753 }
754 
755 /*
756  * vionet_set_hostmac
757  *
758  * Sets the hardware address for the host-side tap(4) on a vionet_dev.
759  *
760  * This should only be called from the event-loop thread.
761  *
762  * vm: pointer to the current vmd_vm instance
763  * idx: index into the array of vionet_dev's for the target vionet_dev
764  * addr: ethernet address to set
765  */
766 void
767 vionet_set_hostmac(struct vmd_vm *vm, unsigned int idx, uint8_t *addr)
768 {
769 	struct vmop_create_params	*vmc = &vm->vm_params;
770 	struct virtio_dev		*dev;
771 	struct vionet_dev		*vionet = NULL;
772 	int ret;
773 
774 	if (idx > vmc->vmc_nnics)
775 		fatalx("%s: invalid vionet index: %u", __func__, idx);
776 
777 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
778 		if (dev->dev_type == VMD_DEVTYPE_NET
779 		    && dev->vionet.idx == idx) {
780 			vionet = &dev->vionet;
781 			break;
782 		}
783 	}
784 	if (vionet == NULL)
785 		fatalx("%s: dev == NULL, idx = %u", __func__, idx);
786 
787 	/* Set the local vm process copy. */
788 	memcpy(vionet->hostmac, addr, sizeof(vionet->hostmac));
789 
790 	/* Send the information to the device process. */
791 	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_HOSTMAC, 0, 0, -1,
792 	    vionet->hostmac, sizeof(vionet->hostmac));
793 	if (ret == -1) {
794 		log_warnx("%s: failed to queue hostmac to vionet dev %u",
795 		    __func__, idx);
796 		return;
797 	}
798 }
799 
800 void
801 virtio_shutdown(struct vmd_vm *vm)
802 {
803 	int ret, status;
804 	pid_t pid = 0;
805 	struct virtio_dev *dev, *tmp;
806 	struct viodev_msg msg;
807 	struct imsgbuf *ibuf;
808 
809 	/* Ensure that our disks are synced. */
810 	if (vioscsi != NULL)
811 		vioscsi->file.close(vioscsi->file.p, 0);
812 
813 	/*
814 	 * Broadcast shutdown to child devices. We need to do this
815 	 * synchronously as we have already stopped the async event thread.
816 	 */
817 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
818 		memset(&msg, 0, sizeof(msg));
819 		msg.type = VIODEV_MSG_SHUTDOWN;
820 		ibuf = &dev->sync_iev.ibuf;
821 		ret = imsg_compose(ibuf, VIODEV_MSG_SHUTDOWN, 0, 0, -1,
822 		    &msg, sizeof(msg));
823 		if (ret == -1)
824 			fatalx("%s: failed to send shutdown to device",
825 			    __func__);
826 		if (imsg_flush(ibuf) == -1)
827 			fatalx("%s: imsg_flush", __func__);
828 	}
829 
830 	/*
831 	 * Wait for all children to shutdown using a simple approach of
832 	 * iterating over known child devices and waiting for them to die.
833 	 */
834 	SLIST_FOREACH_SAFE(dev, &virtio_devs, dev_next, tmp) {
835 		log_debug("%s: waiting on device pid %d", __func__,
836 		    dev->dev_pid);
837 		do {
838 			pid = waitpid(dev->dev_pid, &status, WNOHANG);
839 		} while (pid == 0 || (pid == -1 && errno == EINTR));
840 		if (pid == dev->dev_pid)
841 			log_debug("%s: device for pid %d is stopped",
842 			    __func__, pid);
843 		else
844 			log_warnx("%s: unexpected pid %d", __func__, pid);
845 		free(dev);
846 	}
847 }
848 
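/*
 * The *_restore() functions below read device state written by the
 * corresponding *_dump() functions from fd and re-register the PCI bar
 * handlers for the receiving vmd process.
 */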
849 int
850 vmmci_restore(int fd, uint32_t vm_id)
851 {
852 	log_debug("%s: receiving vmmci", __func__);
853 	if (atomicio(read, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
854 		log_warnx("%s: error reading vmmci from fd", __func__);
855 		return (-1);
856 	}
857 
858 	if (pci_set_bar_fn(vmmci.pci_id, 0, vmmci_io, NULL)) {
859 		log_warnx("%s: can't set bar fn for vmm control device",
860 		    __progname);
861 		return (-1);
862 	}
863 	vmmci.vm_id = vm_id;
864 	vmmci.irq = pci_get_dev_irq(vmmci.pci_id);
865 	memset(&vmmci.timeout, 0, sizeof(struct event));
866 	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
867 	return (0);
868 }
869 
870 int
871 viornd_restore(int fd, struct vmd_vm *vm)
872 {
873 	void *hva = NULL;
874 
875 	log_debug("%s: receiving viornd", __func__);
876 	if (atomicio(read, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
877 		log_warnx("%s: error reading viornd from fd", __func__);
878 		return (-1);
879 	}
880 	if (pci_set_bar_fn(viornd.pci_id, 0, virtio_rnd_io, NULL)) {
881 		log_warnx("%s: can't set bar fn for virtio rng device",
882 		    __progname);
883 		return (-1);
884 	}
885 	viornd.vm_id = vm->vm_params.vmc_params.vcp_id;
886 	viornd.irq = pci_get_dev_irq(viornd.pci_id);
887 
888 	hva = hvaddr_mem(viornd.vq[0].q_gpa, vring_size(VIORND_QUEUE_SIZE));
889 	if (hva == NULL)
890 		fatal("failed to restore viornd virtqueue");
891 	viornd.vq[0].q_hva = hva;
892 
893 	return (0);
894 }
895 
896 int
897 vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
898 {
899 	struct vmop_create_params *vmc = &vm->vm_params;
900 	struct vm_create_params *vcp = &vmc->vmc_params;
901 	struct virtio_dev *dev;
902 	uint8_t i;
903 
904 	if (vmc->vmc_nnics == 0)
905 		return (0);
906 
907 	for (i = 0; i < vmc->vmc_nnics; i++) {
908 		dev = calloc(1, sizeof(struct virtio_dev));
909 		if (dev == NULL) {
910 			log_warn("%s: calloc failure allocating vionet",
911 			    __progname);
912 			return (-1);
913 		}
914 
915 		log_debug("%s: receiving virtio network device", __func__);
916 		if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
917 		    != sizeof(struct virtio_dev)) {
918 			log_warnx("%s: error reading vionet from fd",
919 			    __func__);
920 			return (-1);
921 		}
922 
923 		/* Virtio network */
924 		if (dev->dev_type != VMD_DEVTYPE_NET) {
925 			log_warnx("%s: invalid device type", __func__);
926 			return (-1);
927 		}
928 
929 		dev->sync_fd = -1;
930 		dev->async_fd = -1;
931 		dev->vm_id = vcp->vcp_id;
932 		dev->vm_vmid = vm->vm_vmid;
933 		dev->irq = pci_get_dev_irq(dev->pci_id);
934 
935 		if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
936 			log_warnx("%s: can't set bar fn for virtio net "
937 			    "device", __progname);
938 			return (-1);
939 		}
940 
941 		dev->vionet.data_fd = child_taps[i];
942 		dev->vionet.idx = i;
943 
944 		SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
945 	}
946 
947 	return (0);
948 }
949 
950 int
951 vioblk_restore(int fd, struct vmd_vm *vm,
952     int child_disks[][VM_MAX_BASE_PER_DISK])
953 {
954 	struct vmop_create_params *vmc = &vm->vm_params;
955 	struct virtio_dev *dev;
956 	uint8_t i, j;
957 
958 	if (vmc->vmc_ndisks == 0)
959 		return (0);
960 
961 	for (i = 0; i < vmc->vmc_ndisks; i++) {
962 		dev = calloc(1, sizeof(struct virtio_dev));
963 		if (dev == NULL) {
964 			log_warn("%s: calloc failure allocating vioblks",
965 			    __progname);
966 			return (-1);
967 		}
968 
969 		log_debug("%s: receiving vioblk", __func__);
970 		if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
971 		    != sizeof(struct virtio_dev)) {
972 			log_warnx("%s: error reading vioblk from fd", __func__);
973 			return (-1);
974 		}
975 		if (dev->dev_type != VMD_DEVTYPE_DISK) {
976 			log_warnx("%s: invalid device type", __func__);
977 			return (-1);
978 		}
979 
980 		dev->sync_fd = -1;
981 		dev->async_fd = -1;
982 
983 		if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
984 			log_warnx("%s: can't set bar fn for virtio block "
985 			    "device", __progname);
986 			return (-1);
987 		}
988 		dev->vm_id = vmc->vmc_params.vcp_id;
989 		dev->irq = pci_get_dev_irq(dev->pci_id);
990 
991 		memset(&dev->vioblk.disk_fd, -1, sizeof(dev->vioblk.disk_fd));
992 		dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
993 		for (j = 0; j < dev->vioblk.ndisk_fd; j++)
994 			dev->vioblk.disk_fd[j] = child_disks[i][j];
995 
996 		dev->vioblk.idx = i;
997 		SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
998 	}
999 	return (0);
1000 }
1001 
1002 int
1003 vioscsi_restore(int fd, struct vmd_vm *vm, int child_cdrom)
1004 {
1005 	void *hva = NULL;
1006 	unsigned int i;
1007 
1008 	if (!strlen(vm->vm_params.vmc_cdrom))
1009 		return (0);
1010 
1011 	vioscsi = calloc(1, sizeof(struct vioscsi_dev));
1012 	if (vioscsi == NULL) {
1013 		log_warn("%s: calloc failure allocating vioscsi", __progname);
1014 		return (-1);
1015 	}
1016 
1017 	log_debug("%s: receiving vioscsi", __func__);
1018 
1019 	if (atomicio(read, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
1020 	    sizeof(struct vioscsi_dev)) {
1021 		log_warnx("%s: error reading vioscsi from fd", __func__);
1022 		return (-1);
1023 	}
1024 
1025 	if (pci_set_bar_fn(vioscsi->pci_id, 0, vioscsi_io, vioscsi)) {
1026 		log_warnx("%s: can't set bar fn for vioscsi device",
1027 		    __progname);
1028 		return (-1);
1029 	}
1030 
1031 	vioscsi->vm_id = vm->vm_params.vmc_params.vcp_id;
1032 	vioscsi->irq = pci_get_dev_irq(vioscsi->pci_id);
1033 
1034 	/* vioscsi uses 3 virtqueues. */
1035 	for (i = 0; i < 3; i++) {
1036 		hva = hvaddr_mem(vioscsi->vq[i].q_gpa,
1037 		    vring_size(VIOSCSI_QUEUE_SIZE));
1038 		if (hva == NULL)
1039 			fatal("failed to restore vioscsi virtqueue");
1040 		vioscsi->vq[i].q_hva = hva;
1041 	}
1042 
1043 	return (0);
1044 }
1045 
1046 int
1047 virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
1048     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1049 {
1050 	struct virtio_dev *dev;
1051 	int ret;
1052 
1053 	SLIST_INIT(&virtio_devs);
1054 
1055 	if ((ret = viornd_restore(fd, vm)) == -1)
1056 		return (ret);
1057 
1058 	if ((ret = vioblk_restore(fd, vm, child_disks)) == -1)
1059 		return (ret);
1060 
1061 	if ((ret = vioscsi_restore(fd, vm, child_cdrom)) == -1)
1062 		return (ret);
1063 
1064 	if ((ret = vionet_restore(fd, vm, child_taps)) == -1)
1065 		return (ret);
1066 
1067 	if ((ret = vmmci_restore(fd, vm->vm_params.vmc_params.vcp_id)) == -1)
1068 		return (ret);
1069 
1070 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
1071 		if (virtio_dev_launch(vm, dev) != 0)
1072 			fatalx("%s: failed to restore virtio dev", __func__);
1073 	}
1074 
1075 	return (0);
1076 }
1077 
1078 int
1079 viornd_dump(int fd)
1080 {
1081 	log_debug("%s: sending viornd", __func__);
1082 
1083 	viornd.vq[0].q_hva = NULL;
1084 
1085 	if (atomicio(vwrite, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
1086 		log_warnx("%s: error writing viornd to fd", __func__);
1087 		return (-1);
1088 	}
1089 	return (0);
1090 }
1091 
1092 int
1093 vmmci_dump(int fd)
1094 {
1095 	log_debug("%s: sending vmmci", __func__);
1096 
1097 	if (atomicio(vwrite, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
1098 		log_warnx("%s: error writing vmmci to fd", __func__);
1099 		return (-1);
1100 	}
1101 	return (0);
1102 }
1103 
1104 int
1105 vionet_dump(int fd)
1106 {
1107 	struct virtio_dev	*dev, temp;
1108 	struct viodev_msg	 msg;
1109 	struct imsg		 imsg;
1110 	struct imsgbuf		*ibuf = NULL;
1111 	size_t			 sz;
1112 	int			 ret;
1113 
1114 	log_debug("%s: dumping vionet", __func__);
1115 
1116 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
1117 		if (dev->dev_type != VMD_DEVTYPE_NET)
1118 			continue;
1119 
1120 		memset(&msg, 0, sizeof(msg));
1121 		memset(&imsg, 0, sizeof(imsg));
1122 
1123 		ibuf = &dev->sync_iev.ibuf;
1124 		msg.type = VIODEV_MSG_DUMP;
1125 
1126 		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
1127 		    sizeof(msg));
1128 		if (ret == -1) {
1129 			log_warnx("%s: failed requesting dump of vionet[%d]",
1130 			    __func__, dev->vionet.idx);
1131 			return (-1);
1132 		}
1133 		if (imsg_flush(ibuf) == -1) {
1134 			log_warnx("%s: imsg_flush", __func__);
1135 			return (-1);
1136 		}
1137 
1138 		sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
1139 		if (sz != sizeof(temp)) {
1140 			log_warnx("%s: failed to dump vionet[%d]", __func__,
1141 			    dev->vionet.idx);
1142 			return (-1);
1143 		}
1144 
1145 		temp.vionet.vq[RXQ].q_hva = NULL;
1146 		temp.vionet.vq[TXQ].q_hva = NULL;
1147 		temp.async_fd = -1;
1148 		temp.sync_fd = -1;
1149 		memset(&temp.async_iev, 0, sizeof(temp.async_iev));
1150 		memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));
1151 
1152 		if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
1153 			log_warnx("%s: error writing vionet to fd", __func__);
1154 			return (-1);
1155 		}
1156 	}
1157 
1158 	return (0);
1159 }
1160 
1161 int
1162 vioblk_dump(int fd)
1163 {
1164 	struct virtio_dev	*dev, temp;
1165 	struct viodev_msg	 msg;
1166 	struct imsg		 imsg;
1167 	struct imsgbuf		*ibuf = NULL;
1168 	size_t			 sz;
1169 	int			 ret;
1170 
1171 	log_debug("%s: dumping vioblk", __func__);
1172 
1173 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
1174 		if (dev->dev_type != VMD_DEVTYPE_DISK)
1175 			continue;
1176 
1177 		memset(&msg, 0, sizeof(msg));
1178 		memset(&imsg, 0, sizeof(imsg));
1179 
1180 		ibuf = &dev->sync_iev.ibuf;
1181 		msg.type = VIODEV_MSG_DUMP;
1182 
1183 		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
1184 		    sizeof(msg));
1185 		if (ret == -1) {
1186 			log_warnx("%s: failed requesting dump of vioblk[%d]",
1187 			    __func__, dev->vioblk.idx);
1188 			return (-1);
1189 		}
1190 		if (imsg_flush(ibuf) == -1) {
1191 			log_warnx("%s: imsg_flush", __func__);
1192 			return (-1);
1193 		}
1194 
1195 
1196 		sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
1197 		if (sz != sizeof(temp)) {
1198 			log_warnx("%s: failed to dump vioblk[%d]", __func__,
1199 			    dev->vioblk.idx);
1200 			return (-1);
1201 		}
1202 
1203 		temp.vioblk.vq[0].q_hva = NULL;
1204 		temp.async_fd = -1;
1205 		temp.sync_fd = -1;
1206 		memset(&temp.async_iev, 0, sizeof(temp.async_iev));
1207 		memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));
1208 
1209 		if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
1210 			log_warnx("%s: error writing vioblk to fd", __func__);
1211 			return (-1);
1212 		}
1213 	}
1214 
1215 	return (0);
1216 }
1217 
1218 int
1219 vioscsi_dump(int fd)
1220 {
1221 	unsigned int i;
1222 
1223 	if (vioscsi == NULL)
1224 		return (0);
1225 
1226 	log_debug("%s: sending vioscsi", __func__);
1227 
1228 	for (i = 0; i < 3; i++)
1229 		vioscsi->vq[i].q_hva = NULL;
1230 
1231 	if (atomicio(vwrite, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
1232 	    sizeof(struct vioscsi_dev)) {
1233 		log_warnx("%s: error writing vioscsi to fd", __func__);
1234 		return (-1);
1235 	}
1236 	return (0);
1237 }
1238 
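/*
 * Dump the state of all virtio devices to fd.  The order here must
 * match virtio_restore().
 */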
1239 int
1240 virtio_dump(int fd)
1241 {
1242 	int ret;
1243 
1244 	if ((ret = viornd_dump(fd)) == -1)
1245 		return ret;
1246 
1247 	if ((ret = vioblk_dump(fd)) == -1)
1248 		return ret;
1249 
1250 	if ((ret = vioscsi_dump(fd)) == -1)
1251 		return ret;
1252 
1253 	if ((ret = vionet_dump(fd)) == -1)
1254 		return ret;
1255 
1256 	if ((ret = vmmci_dump(fd)) == -1)
1257 		return ret;
1258 
1259 	return (0);
1260 }
1261 
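/* Ask all child device processes to pause (virtio_stop) or resume (virtio_start). */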
1262 void
1263 virtio_stop(struct vmd_vm *vm)
1264 {
1265 	struct virtio_dev *dev;
1266 	int ret;
1267 
1268 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
1269 		ret = imsg_compose_event(&dev->async_iev, IMSG_VMDOP_PAUSE_VM,
1270 		    0, 0, -1, NULL, 0);
1271 		if (ret == -1) {
1272 			log_warnx("%s: failed to compose pause msg to device",
1273 				__func__);
1274 		}
1275 	}
1276 }
1277 
1278 void
1279 virtio_start(struct vmd_vm *vm)
1280 {
1281 	struct virtio_dev *dev;
1282 	int ret;
1283 
1284 	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
1285 		ret = imsg_compose_event(&dev->async_iev, IMSG_VMDOP_UNPAUSE_VM,
1286 		    0, 0, -1, NULL, 0);
1287 		if (ret == -1) {
1288 			log_warnx("%s: failed to compose start msg to device",
1289 			    __func__);
1290 		}
1291 	}
1292 }
1293 
1294 /*
1295  * Fork+exec a child virtio device. Returns 0 on success.
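 *
 * The parent sends the configured struct virtio_dev and the struct
 * vmd_vm to the child over a synchronous socketpair and waits for a
 * VIODEV_MSG_READY reply before wiring up the async imsg channel.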
1296  */
1297 static int
1298 virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev)
1299 {
1300 	char *nargv[10], num[32], vmm_fd[32], t[2];
1301 	pid_t dev_pid;
1302 	int data_fds[VM_MAX_BASE_PER_DISK], sync_fds[2], async_fds[2], ret = 0;
1303 	size_t i, j, data_fds_sz, sz = 0;
1304 	struct virtio_dev *d = NULL;
1305 	struct viodev_msg msg;
1306 	struct imsg imsg;
1307 	struct imsgev *iev = &dev->sync_iev;
1308 
1309 	switch (dev->dev_type) {
1310 	case VMD_DEVTYPE_NET:
1311 		data_fds[0] = dev->vionet.data_fd;
1312 		data_fds_sz = 1;
1313 		log_debug("%s: launching vionet[%d]",
1314 		    vm->vm_params.vmc_params.vcp_name, dev->vionet.idx);
1315 		break;
1316 	case VMD_DEVTYPE_DISK:
1317 		memcpy(&data_fds, dev->vioblk.disk_fd, sizeof(data_fds));
1318 		data_fds_sz = dev->vioblk.ndisk_fd;
1319 		log_debug("%s: launching vioblk[%d]",
1320 		    vm->vm_params.vmc_params.vcp_name, dev->vioblk.idx);
1321 		break;
1322 		/* NOTREACHED */
1323 	default:
1324 		log_warnx("%s: invalid device type", __func__);
1325 		return (EINVAL);
1326 	}
1327 
1328 	/* We need two channels: one synchronous (IO reads) and one async. */
1329 	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, sync_fds) == -1) {
1330 		log_warn("failed to create socketpair");
1331 		return (errno);
1332 	}
1333 	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, async_fds) == -1) {
1334 		log_warn("failed to create async socketpair");
1335 		return (errno);
1336 	}
1337 
1338 	/* Keep communication channels open after exec. */
1339 	if (fcntl(sync_fds[1], F_SETFD, 0)) {
1340 		ret = errno;
1341 		log_warn("%s: fcntl", __func__);
1342 		goto err;
1343 	}
1344 	if (fcntl(async_fds[1], F_SETFD, 0)) {
1345 		ret = errno;
1346 		log_warn("%s: fcntl", __func__);
1347 		goto err;
1348 	}
1349 
1350 	/* Keep data file descriptors open after exec. */
1351 	for (i = 0; i < data_fds_sz; i++) {
1352 		log_debug("%s: marking fd %d !close-on-exec", __func__,
1353 		    data_fds[i]);
1354 		if (fcntl(data_fds[i], F_SETFD, 0)) {
1355 			ret = errno;
1356 			log_warn("%s: fcntl", __func__);
1357 			goto err;
1358 		}
1359 	}
1360 
1361 	/* Fork... */
1362 	dev_pid = fork();
1363 	if (dev_pid == -1) {
1364 		ret = errno;
1365 		log_warn("%s: fork failed", __func__);
1366 		goto err;
1367 	}
1368 
1369 	if (dev_pid > 0) {
1370 		/* Parent */
1371 		close_fd(sync_fds[1]);
1372 		close_fd(async_fds[1]);
1373 
1374 		/* Save the child's pid to help with cleanup. */
1375 		dev->dev_pid = dev_pid;
1376 
1377 		/* Set the channel fds to the child's before sending. */
1378 		dev->sync_fd = sync_fds[1];
1379 		dev->async_fd = async_fds[1];
1380 
1381 		/* Close data fds. Only the child device needs them now. */
1382 		for (i = 0; i < data_fds_sz; i++)
1383 			close_fd(data_fds[i]);
1384 
1385 		/* Set our synchronous channel to non-blocking. */
1386 		if (fcntl(sync_fds[0], F_SETFL, O_NONBLOCK) == -1) {
1387 			ret = errno;
1388 			log_warn("%s: fcntl", __func__);
1389 			goto err;
1390 		}
1391 
1392 		/* 1. Send over our configured device. */
1393 		log_debug("%s: sending '%c' type device struct", __func__,
1394 			dev->dev_type);
1395 		sz = atomicio(vwrite, sync_fds[0], dev, sizeof(*dev));
1396 		if (sz != sizeof(*dev)) {
1397 			log_warnx("%s: failed to send device", __func__);
1398 			ret = EIO;
1399 			goto err;
1400 		}
1401 
1402 		/* 2. Send over details on the VM (including memory fds). */
1403 		log_debug("%s: sending vm message for '%s'", __func__,
1404 			vm->vm_params.vmc_params.vcp_name);
1405 		sz = atomicio(vwrite, sync_fds[0], vm, sizeof(*vm));
1406 		if (sz != sizeof(*vm)) {
1407 			log_warnx("%s: failed to send vm details", __func__);
1408 			ret = EIO;
1409 			goto err;
1410 		}
1411 
1412 		/*
1413 		 * Initialize our imsg channel to the child device. The initial
1414 		 * communication will be synchronous. We expect the child to
1415 		 * report itself "ready" to confirm the launch was a success.
1416 		 */
1417 		imsg_init(&iev->ibuf, sync_fds[0]);
1418 		do
1419 			ret = imsg_read(&iev->ibuf);
1420 		while (ret == -1 && errno == EAGAIN);
1421 		if (ret == 0 || ret == -1) {
1422 			log_warnx("%s: failed to receive ready message from "
1423 			    "'%c' type device", __func__, dev->dev_type);
1424 			ret = EIO;
1425 			goto err;
1426 		}
1427 		ret = 0;
1428 
1429 		log_debug("%s: receiving reply", __func__);
1430 		if (imsg_get(&iev->ibuf, &imsg) < 1) {
1431 			log_warnx("%s: imsg_get", __func__);
1432 			ret = EIO;
1433 			goto err;
1434 		}
1435 		IMSG_SIZE_CHECK(&imsg, &msg);
1436 		memcpy(&msg, imsg.data, sizeof(msg));
1437 		imsg_free(&imsg);
1438 
1439 		if (msg.type != VIODEV_MSG_READY) {
1440 			log_warnx("%s: expected ready message, got type %d",
1441 			    __func__, msg.type);
1442 			ret = EINVAL;
1443 			goto err;
1444 		}
1445 		log_debug("%s: device reports ready via sync channel",
1446 		    __func__);
1447 
1448 		/*
1449 		 * Wire in the async event handling, but after reverting back
1450 		 * to the parent's fd's.
1451 		 */
1452 		dev->sync_fd = sync_fds[0];
1453 		dev->async_fd = async_fds[0];
1454 		vm_device_pipe(dev, virtio_dispatch_dev);
1455 	} else {
1456 		/* Child */
1457 		close_fd(async_fds[0]);
1458 		close_fd(sync_fds[0]);
1459 
1460 		/*
1461 		 * Close any other device fd's we know aren't
1462 		 * ours. This releases any exclusive locks held on
1463 		 * things like disk images.
1464 		 */
1465 		SLIST_FOREACH(d, &virtio_devs, dev_next) {
1466 			if (d == dev)
1467 				continue;
1468 
1469 			switch (d->dev_type) {
1470 			case VMD_DEVTYPE_DISK:
1471 				for (j = 0; j < d->vioblk.ndisk_fd; j++)
1472 					close_fd(d->vioblk.disk_fd[j]);
1473 				break;
1474 			case VMD_DEVTYPE_NET:
1475 				close_fd(d->vionet.data_fd);
1476 				break;
1477 			default:
1478 				fatalx("%s: invalid device type '%c'",
1479 				    __func__, d->dev_type);
1480 			}
1481 		}
1482 
1483 		memset(&nargv, 0, sizeof(nargv));
1484 		memset(num, 0, sizeof(num));
1485 		snprintf(num, sizeof(num), "%d", sync_fds[1]);
1486 		memset(vmm_fd, 0, sizeof(vmm_fd));
1487 		snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);
1488 
1489 		t[0] = dev->dev_type;
1490 		t[1] = '\0';
1491 
1492 		nargv[0] = env->argv0;
1493 		nargv[1] = "-X";
1494 		nargv[2] = num;
1495 		nargv[3] = "-t";
1496 		nargv[4] = t;
1497 		nargv[5] = "-i";
1498 		nargv[6] = vmm_fd;
1499 		nargv[7] = "-n";
1500 
1501 		if (env->vmd_verbose) {
1502 			nargv[8] = "-v";
1503 			nargv[9] = NULL;
1504 		} else
1505 			nargv[8] = NULL;
1506 
1507 		/* Control resumes in vmd.c:main(). */
1508 		execvp(nargv[0], nargv);
1509 
1510 		ret = errno;
1511 		log_warn("%s: failed to exec device", __func__);
1512 		_exit(ret);
1513 		/* NOTREACHED */
1514 	}
1515 
1516 	return (ret);
1517 
1518 err:
1519 	close_fd(sync_fds[0]);
1520 	close_fd(sync_fds[1]);
1521 	close_fd(async_fds[0]);
1522 	close_fd(async_fds[1]);
1523 	return (ret);
1524 }
1525 
1526 /*
1527  * Initialize an async imsg channel for a virtio device.
1528  */
1529 int
1530 vm_device_pipe(struct virtio_dev *dev, void (*cb)(int, short, void *))
1531 {
1532 	struct imsgev *iev = &dev->async_iev;
1533 	int fd = dev->async_fd;
1534 
1535 	log_debug("%s: initializing '%c' device pipe (fd=%d)", __func__,
1536 	    dev->dev_type, fd);
1537 
1538 	if (fcntl(fd, F_SETFL, O_NONBLOCK) == -1) {
1539 		log_warn("failed to set nonblocking mode on vm device pipe");
1540 		return (-1);
1541 	}
1542 
1543 	imsg_init(&iev->ibuf, fd);
1544 	iev->handler = cb;
1545 	iev->data = dev;
1546 	iev->events = EV_READ;
1547 	imsg_event_add(iev);
1548 
1549 	return (0);
1550 }
1551 
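/*
 * Async event handler for imsgs arriving from a child device process,
 * e.g. interrupt kicks delivered as VIODEV_MSG_KICK.
 */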
1552 void
1553 virtio_dispatch_dev(int fd, short event, void *arg)
1554 {
1555 	struct virtio_dev	*dev = (struct virtio_dev*)arg;
1556 	struct imsgev		*iev = &dev->async_iev;
1557 	struct imsgbuf		*ibuf = &iev->ibuf;
1558 	struct imsg		 imsg;
1559 	struct viodev_msg	 msg;
1560 	ssize_t			 n = 0;
1561 
1562 	if (event & EV_READ) {
1563 		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
1564 			fatal("%s: imsg_read", __func__);
1565 		if (n == 0) {
1566 			/* this pipe is dead, so remove the event handler */
1567 			log_debug("%s: pipe dead (EV_READ)", __func__);
1568 			event_del(&iev->ev);
1569 			event_loopexit(NULL);
1570 			return;
1571 		}
1572 	}
1573 
1574 	if (event & EV_WRITE) {
1575 		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
1576 			fatal("%s: msgbuf_write", __func__);
1577 		if (n == 0) {
1578 			/* this pipe is dead, so remove the event handler */
1579 			log_debug("%s: pipe dead (EV_WRITE)", __func__);
1580 			event_del(&iev->ev);
1581 			event_loopexit(NULL);
1582 			return;
1583 		}
1584 	}
1585 
1586 	for (;;) {
1587 		if ((n = imsg_get(ibuf, &imsg)) == -1)
1588 			fatal("%s: imsg_get", __func__);
1589 		if (n == 0)
1590 			break;
1591 
1592 		switch (imsg.hdr.type) {
1593 		case IMSG_DEVOP_MSG:
1594 			IMSG_SIZE_CHECK(&imsg, &msg);
1595 			memcpy(&msg, imsg.data, sizeof(msg));
1596 			handle_dev_msg(&msg, dev);
1597 			break;
1598 		default:
1599 			log_warnx("%s: got non devop imsg %d", __func__,
1600 			    imsg.hdr.type);
1601 			break;
1602 		}
1603 		imsg_free(&imsg);
1604 	}
1605 	imsg_event_add(iev);
1606 }
1607 
1608 
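/*
 * Service a message received from a device process.  KICK messages are
 * translated into asserting or deasserting the device's irq.
 */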
1609 static int
1610 handle_dev_msg(struct viodev_msg *msg, struct virtio_dev *gdev)
1611 {
1612 	uint32_t vm_id = gdev->vm_id;
1613 	int irq = gdev->irq;
1614 
1615 	switch (msg->type) {
1616 	case VIODEV_MSG_KICK:
1617 		if (msg->state == INTR_STATE_ASSERT)
1618 			vcpu_assert_pic_irq(vm_id, msg->vcpu, irq);
1619 		else if (msg->state == INTR_STATE_DEASSERT)
1620 			vcpu_deassert_pic_irq(vm_id, msg->vcpu, irq);
1621 		break;
1622 	case VIODEV_MSG_READY:
1623 		log_debug("%s: device reports ready", __func__);
1624 		break;
1625 	case VIODEV_MSG_ERROR:
1626 		log_warnx("%s: device reported error", __func__);
1627 		break;
1628 	case VIODEV_MSG_INVALID:
1629 	case VIODEV_MSG_IO_READ:
1630 	case VIODEV_MSG_IO_WRITE:
1631 		/* FALLTHROUGH */
1632 	default:
1633 		log_warnx("%s: unsupported device message type %d", __func__,
1634 		    msg->type);
1635 		return (1);
1636 	}
1637 
1638 	return (0);
1639 }
1640 
1641 /*
1642  * Called by the VM process while processing IO from the VCPU thread.
1643  *
1644  * N.b. Since the VCPU thread calls this function, we cannot mutate the event
1645  * system. All ipc messages must be sent manually and cannot be queued for
1646  * the event loop to push them. (We need to perform a synchronous read, so
1647  * this isn't really a big deal.)
1648  */
1649 int
1650 virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
1651     void *cookie, uint8_t sz)
1652 {
1653 	struct virtio_dev *dev = (struct virtio_dev *)cookie;
1654 	struct imsgbuf *ibuf = &dev->sync_iev.ibuf;
1655 	struct imsg imsg;
1656 	struct viodev_msg msg;
1657 	ssize_t n;
1658 	int ret = 0;
1659 
1660 	memset(&msg, 0, sizeof(msg));
1661 	msg.reg = reg;
1662 	msg.io_sz = sz;
1663 
1664 	if (dir == 0) {
1665 		msg.type = VIODEV_MSG_IO_WRITE;
1666 		msg.data = *data;
1667 		msg.data_valid = 1;
1668 	} else
1669 		msg.type = VIODEV_MSG_IO_READ;
1670 
1671 	if (msg.type == VIODEV_MSG_IO_WRITE) {
1672 		/*
1673 		 * Write request. No reply expected.
1674 		 */
1675 		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
1676 		    sizeof(msg));
1677 		if (ret == -1) {
1678 			log_warn("%s: failed to send async io event to vionet"
1679 			    " device", __func__);
1680 			return (ret);
1681 		}
1682 		if (imsg_flush(ibuf) == -1) {
1683 			log_warnx("%s: imsg_flush (write)", __func__);
1684 			return (-1);
1685 		}
1686 	} else {
1687 		/*
1688 		 * Read request. Requires waiting for a reply.
1689 		 */
1690 		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
1691 		    sizeof(msg));
1692 		if (ret == -1) {
1693 			log_warnx("%s: failed to send sync io event to vionet"
1694 			    " device", __func__);
1695 			return (ret);
1696 		}
1697 		if (imsg_flush(ibuf) == -1) {
1698 			log_warnx("%s: imsg_flush (read)", __func__);
1699 			return (-1);
1700 		}
1701 
1702 		/* Read our reply. */
1703 		do
1704 			n = imsg_read(ibuf);
1705 		while (n == -1 && errno == EAGAIN);
1706 		if (n == 0 || n == -1) {
1707 			log_warn("%s: imsg_read (n=%ld)", __func__, n);
1708 			return (-1);
1709 		}
1710 		if ((n = imsg_get(ibuf, &imsg)) == -1) {
1711 			log_warn("%s: imsg_get (n=%ld)", __func__, n);
1712 			return (-1);
1713 		}
1714 		if (n == 0) {
1715 			log_warnx("%s: invalid imsg", __func__);
1716 			return (-1);
1717 		}
1718 
1719 		IMSG_SIZE_CHECK(&imsg, &msg);
1720 		memcpy(&msg, imsg.data, sizeof(msg));
1721 		imsg_free(&imsg);
1722 
1723 		if (msg.type == VIODEV_MSG_IO_READ && msg.data_valid) {
1724 			log_debug("%s: got sync read response (reg=%s)",
1725 			    __func__, virtio_reg_name(msg.reg));
1726 			*data = msg.data;
1727 			/*
1728 			 * It's possible we're asked to {de,}assert after the
1729 			 * device performs a register read.
1730 			 */
1731 			if (msg.state == INTR_STATE_ASSERT)
1732 				vcpu_assert_pic_irq(dev->vm_id, msg.vcpu, msg.irq);
1733 			else if (msg.state == INTR_STATE_DEASSERT)
1734 				vcpu_deassert_pic_irq(dev->vm_id, msg.vcpu, msg.irq);
1735 		} else {
1736 			log_warnx("%s: expected IO_READ, got %d", __func__,
1737 			    msg.type);
1738 			return (-1);
1739 		}
1740 	}
1741 
1742 	return (0);
1743 }
1744 
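/*
 * Send a VIODEV_MSG_KICK over the device's async channel asking for its
 * irq to be asserted (or, below, deasserted).
 */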
1745 void
1746 virtio_assert_pic_irq(struct virtio_dev *dev, int vcpu)
1747 {
1748 	struct viodev_msg msg;
1749 	int ret;
1750 
1751 	memset(&msg, 0, sizeof(msg));
1752 	msg.irq = dev->irq;
1753 	msg.vcpu = vcpu;
1754 	msg.type = VIODEV_MSG_KICK;
1755 	msg.state = INTR_STATE_ASSERT;
1756 
1757 	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
1758 	    &msg, sizeof(msg));
1759 	if (ret == -1)
1760 		log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
1761 }
1762 
1763 void
1764 virtio_deassert_pic_irq(struct virtio_dev *dev, int vcpu)
1765 {
1766 	struct viodev_msg msg;
1767 	int ret;
1768 
1769 	memset(&msg, 0, sizeof(msg));
1770 	msg.irq = dev->irq;
1771 	msg.vcpu = vcpu;
1772 	msg.type = VIODEV_MSG_KICK;
1773 	msg.state = INTR_STATE_DEASSERT;
1774 
1775 	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
1776 	    &msg, sizeof(msg));
1777 	if (ret == -1)
1778 		log_warnx("%s: failed to deassert irq %d", __func__, dev->irq);
1779 }
1780