/*	$OpenBSD: virtio.c,v 1.123 2025/01/08 15:46:10 dv Exp $	*/

/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>	/* PAGE_SIZE */
#include <sys/socket.h>
#include <sys/wait.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcidevs.h>
#include <dev/pv/virtioreg.h>
#include <dev/pci/virtio_pcireg.h>
#include <dev/pv/vioblkreg.h>
#include <dev/vmm/vmm.h>

#include <net/if.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>

#include <errno.h>
#include <event.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "atomicio.h"
#include "pci.h"
#include "vioscsi.h"
#include "virtio.h"
#include "vmd.h"

extern struct vmd *env;
extern char *__progname;

struct viornd_dev viornd;
struct vioscsi_dev *vioscsi;
struct vmmci_dev vmmci;

/* Devices emulated in subprocesses are inserted into this list. */
SLIST_HEAD(virtio_dev_head, virtio_dev) virtio_devs;

#define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */

#define VIRTIO_NET_F_MAC	(1<<5)

#define VMMCI_F_TIMESYNC	(1<<0)
#define VMMCI_F_ACK		(1<<1)
#define VMMCI_F_SYNCRTC		(1<<2)

#define RXQ	0
#define TXQ	1

static int virtio_dev_launch(struct vmd_vm *, struct virtio_dev *);
static void virtio_dispatch_dev(int, short, void *);
static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *);
static int virtio_dev_closefds(struct virtio_dev *);
static void vmmci_pipe_dispatch(int, short, void *);

const char *
virtio_reg_name(uint8_t reg)
{
	switch (reg) {
	case VIRTIO_CONFIG_DEVICE_FEATURES: return "device feature";
	case VIRTIO_CONFIG_GUEST_FEATURES: return "guest feature";
	case VIRTIO_CONFIG_QUEUE_PFN: return "queue address";
	case VIRTIO_CONFIG_QUEUE_SIZE: return "queue size";
	case VIRTIO_CONFIG_QUEUE_SELECT: return "queue select";
	case VIRTIO_CONFIG_QUEUE_NOTIFY: return "queue notify";
	case VIRTIO_CONFIG_DEVICE_STATUS: return "device status";
	case VIRTIO_CONFIG_ISR_STATUS: return "isr status";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI...VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
		return "device config 0";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
		return "device config 1";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: return "device config 2";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: return "device config 3";
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: return "device config 4";
	default: return "unknown";
	}
}

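/*
 * Compute the total size in bytes of a legacy virtqueue with vq_size
 * entries: the descriptor table and avail ring share one allocation,
 * the used ring gets its own, and each allocation is rounded up by
 * VIRTQUEUE_ALIGN.
 *
 * Worked example, assuming a 64-entry queue and VIRTQUEUE_ALIGN
 * rounding to the 4096-byte VIRTIO_PAGE_SIZE: 64 * 16 bytes of
 * descriptors plus 2 * (2 + 64) bytes of avail ring is 1156 bytes,
 * rounded up to 4096; the used ring needs 4 + 8 * 64 = 516 bytes,
 * rounded up to 4096; 8192 bytes total.
 */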
uint32_t
vring_size(uint32_t vq_size)
{
	uint32_t allocsize1, allocsize2;

	/* allocsize1: descriptor table + avail ring + pad */
	allocsize1 = VIRTQUEUE_ALIGN(sizeof(struct vring_desc) * vq_size
	    + sizeof(uint16_t) * (2 + vq_size));
	/* allocsize2: used ring + pad */
	allocsize2 = VIRTQUEUE_ALIGN(sizeof(uint16_t) * 2
	    + sizeof(struct vring_used_elem) * vq_size);

	return allocsize1 + allocsize2;
}

/* Update queue select */
void
viornd_update_qs(void)
{
	struct virtio_vq_info *vq_info;

	/* Invalid queue? */
	if (viornd.cfg.queue_select > 0) {
		viornd.cfg.queue_size = 0;
		return;
	}

	vq_info = &viornd.vq[viornd.cfg.queue_select];

	/* Update queue pfn/size based on queue select */
	viornd.cfg.queue_pfn = vq_info->q_gpa >> 12;
	viornd.cfg.queue_size = vq_info->qs;
}

/* Update queue address */
void
viornd_update_qa(void)
{
	struct virtio_vq_info *vq_info;
	void *hva = NULL;

	/* Invalid queue? */
	if (viornd.cfg.queue_select > 0)
		return;

	vq_info = &viornd.vq[viornd.cfg.queue_select];
	vq_info->q_gpa = (uint64_t)viornd.cfg.queue_pfn * VIRTIO_PAGE_SIZE;

	hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIORND_QUEUE_SIZE));
	if (hva == NULL)
		fatalx("viornd_update_qa");
	vq_info->q_hva = hva;
}

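/*
 * viornd_notifyq
 *
 * Process the guest's queue notification: pull one descriptor off the
 * avail ring, fill the guest buffer it points at with arc4random(3)
 * data, and publish the result on the used ring. Returns 1 if the
 * caller should assert the device's interrupt.
 */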
int
viornd_notifyq(void)
{
	size_t sz;
	int dxx, ret;
	uint16_t aidx, uidx;
	char *vr, *rnd_data;
	struct vring_desc *desc;
	struct vring_avail *avail;
	struct vring_used *used;
	struct virtio_vq_info *vq_info;

	ret = 0;

	/* Invalid queue? */
	if (viornd.cfg.queue_notify > 0)
		return (0);

	vq_info = &viornd.vq[viornd.cfg.queue_notify];
	vr = vq_info->q_hva;
	if (vr == NULL)
		fatalx("%s: null vring", __func__);

	desc = (struct vring_desc *)(vr);
	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);

	aidx = avail->idx & VIORND_QUEUE_MASK;
	uidx = used->idx & VIORND_QUEUE_MASK;

	dxx = avail->ring[aidx] & VIORND_QUEUE_MASK;

	sz = desc[dxx].len;
	if (sz > MAXPHYS)
		fatalx("viornd descriptor size too large (%zu)", sz);

	rnd_data = malloc(sz);

	if (rnd_data != NULL) {
		arc4random_buf(rnd_data, sz);
		if (write_mem(desc[dxx].addr, rnd_data, sz)) {
			log_warnx("viornd: can't write random data @ "
			    "0x%llx",
			    desc[dxx].addr);
		} else {
			/* ret == 1 -> interrupt needed */
			/* XXX check VIRTIO_F_NO_INTR */
			ret = 1;
			viornd.cfg.isr_status = 1;
			used->ring[uidx].id = dxx;
			used->ring[uidx].len = sz;
			__sync_synchronize();
			used->idx++;
		}
		free(rnd_data);
	} else
		fatal("memory allocation error for viornd data");

	return (ret);
}

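/*
 * virtio_rnd_io
 *
 * PCI i/o bar handler for the entropy device. dir == 0 is a write from
 * the guest, anything else is a read. Reading the ISR status register
 * clears it and deasserts the irq, per the legacy virtio contract.
 */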
int
virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
    void *unused, uint8_t sz)
{
	*intr = 0xFF;

	if (dir == 0) {
		switch (reg) {
		case VIRTIO_CONFIG_DEVICE_FEATURES:
		case VIRTIO_CONFIG_QUEUE_SIZE:
		case VIRTIO_CONFIG_ISR_STATUS:
			log_warnx("%s: illegal write %x to %s",
			    __progname, *data, virtio_reg_name(reg));
			break;
		case VIRTIO_CONFIG_GUEST_FEATURES:
			viornd.cfg.guest_feature = *data;
			break;
		case VIRTIO_CONFIG_QUEUE_PFN:
			viornd.cfg.queue_pfn = *data;
			viornd_update_qa();
			break;
		case VIRTIO_CONFIG_QUEUE_SELECT:
			viornd.cfg.queue_select = *data;
			viornd_update_qs();
			break;
		case VIRTIO_CONFIG_QUEUE_NOTIFY:
			viornd.cfg.queue_notify = *data;
			if (viornd_notifyq())
				*intr = 1;
			break;
		case VIRTIO_CONFIG_DEVICE_STATUS:
			viornd.cfg.device_status = *data;
			break;
		}
	} else {
		switch (reg) {
		case VIRTIO_CONFIG_DEVICE_FEATURES:
			*data = viornd.cfg.device_feature;
			break;
		case VIRTIO_CONFIG_GUEST_FEATURES:
			*data = viornd.cfg.guest_feature;
			break;
		case VIRTIO_CONFIG_QUEUE_PFN:
			*data = viornd.cfg.queue_pfn;
			break;
		case VIRTIO_CONFIG_QUEUE_SIZE:
			*data = viornd.cfg.queue_size;
			break;
		case VIRTIO_CONFIG_QUEUE_SELECT:
			*data = viornd.cfg.queue_select;
			break;
		case VIRTIO_CONFIG_QUEUE_NOTIFY:
			*data = viornd.cfg.queue_notify;
			break;
		case VIRTIO_CONFIG_DEVICE_STATUS:
			*data = viornd.cfg.device_status;
			break;
		case VIRTIO_CONFIG_ISR_STATUS:
			*data = viornd.cfg.isr_status;
			viornd.cfg.isr_status = 0;
			vcpu_deassert_irq(viornd.vm_id, 0, viornd.irq);
			break;
		}
	}
	return (0);
}

/*
 * vmmci_ctl
 *
 * Inject a command into the vmmci device, potentially delivering an
 * interrupt.
 *
 * Called by the vm process's event(3) loop.
 */
int
vmmci_ctl(unsigned int cmd)
{
	int ret = 0;
	struct timeval tv = { 0, 0 };

	mutex_lock(&vmmci.mutex);

	if ((vmmci.cfg.device_status &
	    VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0) {
		ret = -1;
		goto unlock;
	}

	if (cmd == vmmci.cmd)
		goto unlock;

	switch (cmd) {
	case VMMCI_NONE:
		break;
	case VMMCI_SHUTDOWN:
	case VMMCI_REBOOT:
		/* Update command */
		vmmci.cmd = cmd;

		/*
		 * vmm VMs do not support powerdown, send a reboot request
		 * instead and turn it off after the triple fault.
		 */
		if (cmd == VMMCI_SHUTDOWN)
			cmd = VMMCI_REBOOT;

		/* Trigger interrupt */
		vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
		vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq);

		/* Add ACK timeout */
		tv.tv_sec = VMMCI_TIMEOUT_SHORT;
		evtimer_add(&vmmci.timeout, &tv);
		break;
	case VMMCI_SYNCRTC:
		if (vmmci.cfg.guest_feature & VMMCI_F_SYNCRTC) {
			/* RTC updated, request guest VM resync of its RTC */
			vmmci.cmd = cmd;

			vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
			vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq);
		} else {
			log_debug("%s: RTC sync skipped (guest does not "
			    "support RTC sync)", __func__);
		}
		break;
	default:
		fatalx("invalid vmmci command: %d", cmd);
	}

unlock:
	mutex_unlock(&vmmci.mutex);

	return (ret);
}

/*
 * vmmci_ack
 *
 * Process a write to the command register.
 *
 * Called by the vcpu thread. Must be called with the mutex held.
 */
void
vmmci_ack(unsigned int cmd)
{
	switch (cmd) {
	case VMMCI_NONE:
		break;
	case VMMCI_SHUTDOWN:
		/*
		 * The shutdown was requested by the VM if we don't have
		 * a pending shutdown request.  In this case add a short
		 * timeout to give the VM a chance to reboot before the
		 * timer expires.
		 */
		if (vmmci.cmd == 0) {
			log_debug("%s: vm %u requested shutdown", __func__,
			    vmmci.vm_id);
			vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_SHORT);
			return;
		}
		/* FALLTHROUGH */
	case VMMCI_REBOOT:
		/*
		 * If the VM acknowledged our shutdown request, give it
		 * enough time to shut down or reboot gracefully.  This
		 * might take a considerable amount of time (running
		 * rc.shutdown on the VM), so increase the timeout before
		 * killing it forcefully.
		 */
		if (cmd == vmmci.cmd) {
			log_debug("%s: vm %u acknowledged shutdown request",
			    __func__, vmmci.vm_id);
			vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_LONG);
		}
		break;
	case VMMCI_SYNCRTC:
		log_debug("%s: vm %u acknowledged RTC sync request",
		    __func__, vmmci.vm_id);
		vmmci.cmd = VMMCI_NONE;
		break;
	default:
		log_warnx("%s: illegal request %u", __func__, cmd);
		break;
	}
}

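/*
 * vmmci_timeout
 *
 * Fires when the guest has not acknowledged a shutdown or reboot
 * request before the ACK timer expired; forcefully stops the vm.
 */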
void
vmmci_timeout(int fd, short type, void *arg)
{
	log_debug("%s: vm %u shutdown", __progname, vmmci.vm_id);
	vm_shutdown(vmmci.cmd == VMMCI_REBOOT ? VMMCI_REBOOT : VMMCI_SHUTDOWN);
}

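/*
 * vmmci_io
 *
 * PCI i/o bar handler for the control device, run by the vcpu thread.
 * It takes vmmci.mutex to synchronize with vmmci_ctl() in the event
 * loop. The TIMESYNC config registers expose tv_sec and tv_usec as two
 * 32-bit halves each; reading the first register latches the current
 * time with gettimeofday(2) and the following registers return the
 * remaining halves of that snapshot.
 */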
int
vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
    void *unused, uint8_t sz)
{
	*intr = 0xFF;

	mutex_lock(&vmmci.mutex);
	if (dir == 0) {
		switch (reg) {
		case VIRTIO_CONFIG_DEVICE_FEATURES:
		case VIRTIO_CONFIG_QUEUE_SIZE:
		case VIRTIO_CONFIG_ISR_STATUS:
			log_warnx("%s: illegal write %x to %s",
			    __progname, *data, virtio_reg_name(reg));
			break;
		case VIRTIO_CONFIG_GUEST_FEATURES:
			vmmci.cfg.guest_feature = *data;
			break;
		case VIRTIO_CONFIG_QUEUE_PFN:
			vmmci.cfg.queue_pfn = *data;
			break;
		case VIRTIO_CONFIG_QUEUE_SELECT:
			vmmci.cfg.queue_select = *data;
			break;
		case VIRTIO_CONFIG_QUEUE_NOTIFY:
			vmmci.cfg.queue_notify = *data;
			break;
		case VIRTIO_CONFIG_DEVICE_STATUS:
			vmmci.cfg.device_status = *data;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
			vmmci_ack(*data);
			break;
		}
	} else {
		switch (reg) {
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
			*data = vmmci.cmd;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
			/* Update time once when reading the first register */
			gettimeofday(&vmmci.time, NULL);
			*data = (uint64_t)vmmci.time.tv_sec;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
			*data = (uint64_t)vmmci.time.tv_sec >> 32;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
			*data = (uint64_t)vmmci.time.tv_usec;
			break;
		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16:
			*data = (uint64_t)vmmci.time.tv_usec >> 32;
			break;
		case VIRTIO_CONFIG_DEVICE_FEATURES:
			*data = vmmci.cfg.device_feature;
			break;
		case VIRTIO_CONFIG_GUEST_FEATURES:
			*data = vmmci.cfg.guest_feature;
			break;
		case VIRTIO_CONFIG_QUEUE_PFN:
			*data = vmmci.cfg.queue_pfn;
			break;
		case VIRTIO_CONFIG_QUEUE_SIZE:
			*data = vmmci.cfg.queue_size;
			break;
		case VIRTIO_CONFIG_QUEUE_SELECT:
			*data = vmmci.cfg.queue_select;
			break;
		case VIRTIO_CONFIG_QUEUE_NOTIFY:
			*data = vmmci.cfg.queue_notify;
			break;
		case VIRTIO_CONFIG_DEVICE_STATUS:
			*data = vmmci.cfg.device_status;
			break;
		case VIRTIO_CONFIG_ISR_STATUS:
			*data = vmmci.cfg.isr_status;
			vmmci.cfg.isr_status = 0;
			vcpu_deassert_irq(vmmci.vm_id, 0, vmmci.irq);
			break;
		}
	}
	mutex_unlock(&vmmci.mutex);

	return (0);
}

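/*
 * virtio_get_base
 *
 * Resolve the base (backing) image path for a disk image. Raw images
 * have no base; qcow2 images may name a backing file, which
 * virtio_qcow2_get_base() extracts from the image header.
 */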
int
virtio_get_base(int fd, char *path, size_t npath, int type, const char *dpath)
{
	switch (type) {
	case VMDF_RAW:
		return 0;
	case VMDF_QCOW2:
		return virtio_qcow2_get_base(fd, path, npath, dpath);
	}
	log_warnx("%s: invalid disk format", __func__);
	return -1;
}

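/*
 * vmmci_pipe_dispatch
 *
 * Runs in the event-loop thread when the vcpu thread posts a timeout
 * request over the device pipe, keeping evtimer(3) manipulation off
 * the vcpu thread.
 */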
static void
vmmci_pipe_dispatch(int fd, short event, void *arg)
{
	enum pipe_msg_type msg;
	struct timeval tv = { 0, 0 };

	msg = vm_pipe_recv(&vmmci.dev_pipe);
	switch (msg) {
	case VMMCI_SET_TIMEOUT_SHORT:
		tv.tv_sec = VMMCI_TIMEOUT_SHORT;
		evtimer_add(&vmmci.timeout, &tv);
		break;
	case VMMCI_SET_TIMEOUT_LONG:
		tv.tv_sec = VMMCI_TIMEOUT_LONG;
		evtimer_add(&vmmci.timeout, &tv);
		break;
	default:
		log_warnx("%s: invalid pipe message type %d", __func__, msg);
	}
}

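/*
 * virtio_init
 *
 * Create the PCI devices for a new vm: the viornd entropy device, one
 * vionet per configured nic, one vioblk per disk, a vioscsi device if
 * a cdrom was provided, and the vmmci control device. vionet and
 * vioblk instances are then launched as device subprocesses.
 */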
void
virtio_init(struct vmd_vm *vm, int child_cdrom,
    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct virtio_dev *dev;
	uint8_t id;
	uint8_t i, j;
	int ret = 0;

	/* Virtio entropy device */
	if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
	    PCI_PRODUCT_QUMRANET_VIO_RNG, PCI_CLASS_SYSTEM,
	    PCI_SUBCLASS_SYSTEM_MISC,
	    PCI_VENDOR_OPENBSD,
	    PCI_PRODUCT_VIRTIO_ENTROPY, 1, NULL)) {
		log_warnx("%s: can't add PCI virtio rng device",
		    __progname);
		return;
	}

	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_rnd_io, NULL)) {
		log_warnx("%s: can't add bar for virtio rng device",
		    __progname);
		return;
	}

	memset(&viornd, 0, sizeof(viornd));
	viornd.vq[0].qs = VIORND_QUEUE_SIZE;
	viornd.vq[0].vq_availoffset = sizeof(struct vring_desc) *
	    VIORND_QUEUE_SIZE;
	viornd.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
	    sizeof(struct vring_desc) * VIORND_QUEUE_SIZE
	    + sizeof(uint16_t) * (2 + VIORND_QUEUE_SIZE));
	viornd.pci_id = id;
	viornd.irq = pci_get_dev_irq(id);
	viornd.vm_id = vcp->vcp_id;

	SLIST_INIT(&virtio_devs);

	if (vmc->vmc_nnics > 0) {
		for (i = 0; i < vmc->vmc_nnics; i++) {
			dev = calloc(1, sizeof(struct virtio_dev));
			if (dev == NULL) {
				log_warn("%s: calloc failure allocating vionet",
				    __progname);
				return;
			}
			/* Virtio network */
			dev->dev_type = VMD_DEVTYPE_NET;

			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
				PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM,
				PCI_SUBCLASS_SYSTEM_MISC, PCI_VENDOR_OPENBSD,
				PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) {
				log_warnx("%s: can't add PCI virtio net device",
				    __progname);
				return;
			}
			dev->pci_id = id;
			dev->sync_fd = -1;
			dev->async_fd = -1;
			dev->vm_id = vcp->vcp_id;
			dev->vm_vmid = vm->vm_vmid;
			dev->irq = pci_get_dev_irq(id);

			/* The vionet pci bar function is called by the vcpu. */
			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
			    dev)) {
				log_warnx("%s: can't add bar for virtio net "
				    "device", __progname);
				return;
			}

			dev->vionet.vq[RXQ].qs = VIONET_QUEUE_SIZE;
			dev->vionet.vq[RXQ].vq_availoffset =
			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
			dev->vionet.vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
				sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
				+ sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
			dev->vionet.vq[RXQ].last_avail = 0;
			dev->vionet.vq[RXQ].notified_avail = 0;

			dev->vionet.vq[TXQ].qs = VIONET_QUEUE_SIZE;
			dev->vionet.vq[TXQ].vq_availoffset =
			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
			dev->vionet.vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
				sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
				+ sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
			dev->vionet.vq[TXQ].last_avail = 0;
			dev->vionet.vq[TXQ].notified_avail = 0;

			dev->vionet.data_fd = child_taps[i];

			/* MAC address has been assigned by the parent */
			memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
			dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;

			dev->vionet.lockedmac =
			    vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
			dev->vionet.local =
			    vmc->vmc_ifflags[i] & VMIFF_LOCAL ? 1 : 0;
			if (i == 0 && vmc->vmc_bootdevice & VMBOOTDEV_NET)
				dev->vionet.pxeboot = 1;
			memcpy(&dev->vionet.local_prefix,
			    &env->vmd_cfg.cfg_localprefix,
			    sizeof(dev->vionet.local_prefix));
			log_debug("%s: vm \"%s\" vio%u lladdr %s%s%s%s",
			    __func__, vcp->vcp_name, i,
			    ether_ntoa((void *)dev->vionet.mac),
			    dev->vionet.lockedmac ? ", locked" : "",
			    dev->vionet.local ? ", local" : "",
			    dev->vionet.pxeboot ? ", pxeboot" : "");

			/* Add the vionet to our device list. */
			dev->vionet.idx = i;
			SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
		}
	}

	if (vmc->vmc_ndisks > 0) {
		for (i = 0; i < vmc->vmc_ndisks; i++) {
			dev = calloc(1, sizeof(struct virtio_dev));
			if (dev == NULL) {
				log_warn("%s: calloc failure allocating vioblk",
				    __progname);
				return;
			}

			/* One vioblk device for each disk defined in vcp */
			dev->dev_type = VMD_DEVTYPE_DISK;

			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
			    PCI_PRODUCT_QUMRANET_VIO_BLOCK,
			    PCI_CLASS_MASS_STORAGE,
			    PCI_SUBCLASS_MASS_STORAGE_SCSI,
			    PCI_VENDOR_OPENBSD,
			    PCI_PRODUCT_VIRTIO_BLOCK, 1, NULL)) {
				log_warnx("%s: can't add PCI virtio block "
				    "device", __progname);
				return;
			}
			dev->pci_id = id;
			dev->sync_fd = -1;
			dev->async_fd = -1;
			dev->vm_id = vcp->vcp_id;
			dev->vm_vmid = vm->vm_vmid;
			dev->irq = pci_get_dev_irq(id);

			/* The bar handler expects the virtio_dev wrapper. */
			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
			    dev)) {
				log_warnx("%s: can't add bar for virtio block "
				    "device", __progname);
				return;
			}
			dev->vioblk.vq[0].qs = VIOBLK_QUEUE_SIZE;
			dev->vioblk.vq[0].vq_availoffset =
			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE;
			dev->vioblk.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE
			    + sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE));
			dev->vioblk.vq[0].last_avail = 0;
			dev->vioblk.cfg.device_feature =
			    VIRTIO_BLK_F_SEG_MAX;
			dev->vioblk.seg_max = VIOBLK_SEG_MAX;

			/*
			 * Initialize disk fds to an invalid fd (-1), then
			 * set any child disk fds.
			 */
			memset(&dev->vioblk.disk_fd, -1,
			    sizeof(dev->vioblk.disk_fd));
			dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
			for (j = 0; j < dev->vioblk.ndisk_fd; j++)
				dev->vioblk.disk_fd[j] = child_disks[i][j];

			dev->vioblk.idx = i;
			SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
		}
	}

	/*
	 * Launch virtio devices that support subprocess execution.
	 */
	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (virtio_dev_launch(vm, dev) != 0)
			fatalx("failed to launch virtio device");
	}

	/* vioscsi cdrom */
	if (strlen(vmc->vmc_cdrom)) {
		vioscsi = calloc(1, sizeof(struct vioscsi_dev));
		if (vioscsi == NULL) {
			log_warn("%s: calloc failure allocating vioscsi",
			    __progname);
			return;
		}

		if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
		    PCI_PRODUCT_QUMRANET_VIO_SCSI,
		    PCI_CLASS_MASS_STORAGE,
		    PCI_SUBCLASS_MASS_STORAGE_SCSI,
		    PCI_VENDOR_OPENBSD,
		    PCI_PRODUCT_VIRTIO_SCSI, 1, NULL)) {
			log_warnx("%s: can't add PCI vioscsi device",
			    __progname);
			return;
		}

		if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vioscsi_io, vioscsi)) {
			log_warnx("%s: can't add bar for vioscsi device",
			    __progname);
			return;
		}

		for (i = 0; i < VIRTIO_MAX_QUEUES; i++) {
			vioscsi->vq[i].qs = VIOSCSI_QUEUE_SIZE;
			vioscsi->vq[i].vq_availoffset =
			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE;
			vioscsi->vq[i].vq_usedoffset = VIRTQUEUE_ALIGN(
			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE
			    + sizeof(uint16_t) * (2 + VIOSCSI_QUEUE_SIZE));
			vioscsi->vq[i].last_avail = 0;
		}
		if (virtio_raw_init(&vioscsi->file, &vioscsi->sz, &child_cdrom,
		    1) == -1) {
			log_warnx("%s: unable to determine iso format",
			    __func__);
			return;
		}
		vioscsi->locked = 0;
		vioscsi->lba = 0;
		vioscsi->n_blocks = vioscsi->sz / VIOSCSI_BLOCK_SIZE_CDROM;
		vioscsi->max_xfer = VIOSCSI_BLOCK_SIZE_CDROM;
		vioscsi->pci_id = id;
		vioscsi->vm_id = vcp->vcp_id;
		vioscsi->irq = pci_get_dev_irq(id);
	}

	/* virtio control device */
	if (pci_add_device(&id, PCI_VENDOR_OPENBSD,
	    PCI_PRODUCT_OPENBSD_CONTROL,
	    PCI_CLASS_COMMUNICATIONS,
	    PCI_SUBCLASS_COMMUNICATIONS_MISC,
	    PCI_VENDOR_OPENBSD,
	    PCI_PRODUCT_VIRTIO_VMMCI, 1, NULL)) {
		log_warnx("%s: can't add PCI vmm control device",
		    __progname);
		return;
	}

	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vmmci_io, NULL)) {
		log_warnx("%s: can't add bar for vmm control device",
		    __progname);
		return;
	}

	memset(&vmmci, 0, sizeof(vmmci));
	vmmci.cfg.device_feature = VMMCI_F_TIMESYNC | VMMCI_F_ACK |
	    VMMCI_F_SYNCRTC;
	vmmci.vm_id = vcp->vcp_id;
	vmmci.irq = pci_get_dev_irq(id);
	vmmci.pci_id = id;
	ret = pthread_mutex_init(&vmmci.mutex, NULL);
	if (ret) {
		errno = ret;
		fatal("could not initialize vmmci mutex");
	}

	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
	vm_pipe_init(&vmmci.dev_pipe, vmmci_pipe_dispatch);
	event_add(&vmmci.dev_pipe.read_ev, NULL);
}

/*
 * vionet_set_hostmac
 *
 * Sets the hardware address for the host-side tap(4) on a vionet_dev.
 *
 * This should only be called from the event-loop thread.
 *
 * vm: pointer to the current vmd_vm instance
 * idx: index into the array of vionet_dev's for the target vionet_dev
 * addr: ethernet address to set
 */
void
vionet_set_hostmac(struct vmd_vm *vm, unsigned int idx, uint8_t *addr)
{
	struct vmop_create_params	*vmc = &vm->vm_params;
	struct virtio_dev		*dev;
	struct vionet_dev		*vionet = NULL;
	int ret;

	if (idx >= vmc->vmc_nnics)
		fatalx("%s: invalid vionet index: %u", __func__, idx);

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (dev->dev_type == VMD_DEVTYPE_NET
		    && dev->vionet.idx == idx) {
			vionet = &dev->vionet;
			break;
		}
	}
	if (vionet == NULL)
		fatalx("%s: dev == NULL, idx = %u", __func__, idx);

	/* Set the local vm process copy. */
	memcpy(vionet->hostmac, addr, sizeof(vionet->hostmac));

	/* Send the information to the device process. */
	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_HOSTMAC, 0, 0, -1,
	    vionet->hostmac, sizeof(vionet->hostmac));
	if (ret == -1) {
		log_warnx("%s: failed to queue hostmac to vionet dev %u",
		    __func__, idx);
		return;
	}
}

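/*
 * virtio_shutdown
 *
 * Synchronously broadcast a shutdown message to all device
 * subprocesses, then reap each child with waitpid(2).
 */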
void
virtio_shutdown(struct vmd_vm *vm)
{
	int ret, status;
	pid_t pid = 0;
	struct virtio_dev *dev, *tmp;
	struct viodev_msg msg;
	struct imsgbuf *ibuf;

	/* Ensure that our disks are synced. */
	if (vioscsi != NULL)
		vioscsi->file.close(vioscsi->file.p, 0);

	/*
	 * Broadcast shutdown to child devices. We need to do this
	 * synchronously as we have already stopped the async event thread.
	 */
	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		memset(&msg, 0, sizeof(msg));
		msg.type = VIODEV_MSG_SHUTDOWN;
		ibuf = &dev->sync_iev.ibuf;
		ret = imsg_compose(ibuf, VIODEV_MSG_SHUTDOWN, 0, 0, -1,
		    &msg, sizeof(msg));
		if (ret == -1)
			fatalx("%s: failed to send shutdown to device",
			    __func__);
		if (imsgbuf_flush(ibuf) == -1)
			fatalx("%s: imsgbuf_flush", __func__);
	}

	/*
	 * Wait for all children to shut down using a simple approach of
	 * iterating over known child devices and waiting for them to die.
	 */
	SLIST_FOREACH_SAFE(dev, &virtio_devs, dev_next, tmp) {
		log_debug("%s: waiting on device pid %d", __func__,
		    dev->dev_pid);
		do {
			pid = waitpid(dev->dev_pid, &status, WNOHANG);
		} while (pid == 0 || (pid == -1 && errno == EINTR));
		if (pid == dev->dev_pid)
			log_debug("%s: device for pid %d is stopped",
			    __func__, pid);
		else
			log_warnx("%s: unexpected pid %d", __func__, pid);
		free(dev);
	}
}

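/*
 * The *_restore functions read each device's serialized state back
 * with atomicio(3), reattach the pci bar function, and recompute any
 * state (irqs, host virtual addresses, fds) that is not meaningful
 * across a dump/restore cycle.
 */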
int
vmmci_restore(int fd, uint32_t vm_id)
{
	log_debug("%s: receiving vmmci", __func__);
	if (atomicio(read, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
		log_warnx("%s: error reading vmmci from fd", __func__);
		return (-1);
	}

	if (pci_set_bar_fn(vmmci.pci_id, 0, vmmci_io, NULL)) {
		log_warnx("%s: can't set bar fn for vmm control device",
		    __progname);
		return (-1);
	}
	vmmci.vm_id = vm_id;
	vmmci.irq = pci_get_dev_irq(vmmci.pci_id);
	memset(&vmmci.timeout, 0, sizeof(struct event));
	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
	return (0);
}

int
viornd_restore(int fd, struct vmd_vm *vm)
{
	void *hva = NULL;

	log_debug("%s: receiving viornd", __func__);
	if (atomicio(read, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
		log_warnx("%s: error reading viornd from fd", __func__);
		return (-1);
	}
	if (pci_set_bar_fn(viornd.pci_id, 0, virtio_rnd_io, NULL)) {
		log_warnx("%s: can't set bar fn for virtio rng device",
		    __progname);
		return (-1);
	}
	viornd.vm_id = vm->vm_params.vmc_params.vcp_id;
	viornd.irq = pci_get_dev_irq(viornd.pci_id);

	hva = hvaddr_mem(viornd.vq[0].q_gpa, vring_size(VIORND_QUEUE_SIZE));
	if (hva == NULL)
		fatal("failed to restore viornd virtqueue");
	viornd.vq[0].q_hva = hva;

	return (0);
}

int
vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct virtio_dev *dev;
	uint8_t i;

	if (vmc->vmc_nnics == 0)
		return (0);

	for (i = 0; i < vmc->vmc_nnics; i++) {
		dev = calloc(1, sizeof(struct virtio_dev));
		if (dev == NULL) {
			log_warn("%s: calloc failure allocating vionet",
			    __progname);
			return (-1);
		}

		log_debug("%s: receiving virtio network device", __func__);
		if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
		    != sizeof(struct virtio_dev)) {
			log_warnx("%s: error reading vionet from fd",
			    __func__);
			return (-1);
		}

		/* Virtio network */
		if (dev->dev_type != VMD_DEVTYPE_NET) {
			log_warnx("%s: invalid device type", __func__);
			return (-1);
		}

		dev->sync_fd = -1;
		dev->async_fd = -1;
		dev->vm_id = vcp->vcp_id;
		dev->vm_vmid = vm->vm_vmid;
		dev->irq = pci_get_dev_irq(dev->pci_id);

		if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
			log_warnx("%s: can't set bar fn for virtio net "
			    "device", __progname);
			return (-1);
		}

		dev->vionet.data_fd = child_taps[i];
		dev->vionet.idx = i;

		SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
	}

	return (0);
}

int
vioblk_restore(int fd, struct vmd_vm *vm,
    int child_disks[][VM_MAX_BASE_PER_DISK])
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct virtio_dev *dev;
	uint8_t i, j;

	if (vmc->vmc_ndisks == 0)
		return (0);

	for (i = 0; i < vmc->vmc_ndisks; i++) {
		dev = calloc(1, sizeof(struct virtio_dev));
		if (dev == NULL) {
			log_warn("%s: calloc failure allocating vioblk",
			    __progname);
			return (-1);
		}

		log_debug("%s: receiving vioblk", __func__);
		if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
		    != sizeof(struct virtio_dev)) {
			log_warnx("%s: error reading vioblk from fd", __func__);
			return (-1);
		}
		if (dev->dev_type != VMD_DEVTYPE_DISK) {
			log_warnx("%s: invalid device type", __func__);
			return (-1);
		}

		dev->sync_fd = -1;
		dev->async_fd = -1;

		if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
			log_warnx("%s: can't set bar fn for virtio block "
			    "device", __progname);
			return (-1);
		}
		dev->vm_id = vmc->vmc_params.vcp_id;
		dev->irq = pci_get_dev_irq(dev->pci_id);

		memset(&dev->vioblk.disk_fd, -1, sizeof(dev->vioblk.disk_fd));
		dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
		for (j = 0; j < dev->vioblk.ndisk_fd; j++)
			dev->vioblk.disk_fd[j] = child_disks[i][j];

		dev->vioblk.idx = i;
		SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
	}
	return (0);
}

int
vioscsi_restore(int fd, struct vmd_vm *vm, int child_cdrom)
{
	void *hva = NULL;
	unsigned int i;

	if (!strlen(vm->vm_params.vmc_cdrom))
		return (0);

	vioscsi = calloc(1, sizeof(struct vioscsi_dev));
	if (vioscsi == NULL) {
		log_warn("%s: calloc failure allocating vioscsi", __progname);
		return (-1);
	}

	log_debug("%s: receiving vioscsi", __func__);

	if (atomicio(read, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
	    sizeof(struct vioscsi_dev)) {
		log_warnx("%s: error reading vioscsi from fd", __func__);
		return (-1);
	}

	if (pci_set_bar_fn(vioscsi->pci_id, 0, vioscsi_io, vioscsi)) {
		log_warnx("%s: can't set bar fn for vioscsi device",
		    __progname);
		return (-1);
	}

	vioscsi->vm_id = vm->vm_params.vmc_params.vcp_id;
	vioscsi->irq = pci_get_dev_irq(vioscsi->pci_id);

	/* vioscsi uses 3 virtqueues. */
	for (i = 0; i < 3; i++) {
		hva = hvaddr_mem(vioscsi->vq[i].q_gpa,
		    vring_size(VIOSCSI_QUEUE_SIZE));
		if (hva == NULL)
			fatal("failed to restore vioscsi virtqueue");
		vioscsi->vq[i].q_hva = hva;
	}

	return (0);
}

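/*
 * virtio_restore
 *
 * Restore all virtio devices from fd in the same order virtio_dump()
 * wrote them, then relaunch the subprocess-backed devices.
 */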
int
virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
	struct virtio_dev *dev;
	int ret;

	SLIST_INIT(&virtio_devs);

	if ((ret = viornd_restore(fd, vm)) == -1)
		return (ret);

	if ((ret = vioblk_restore(fd, vm, child_disks)) == -1)
		return (ret);

	if ((ret = vioscsi_restore(fd, vm, child_cdrom)) == -1)
		return (ret);

	if ((ret = vionet_restore(fd, vm, child_taps)) == -1)
		return (ret);

	if ((ret = vmmci_restore(fd, vm->vm_params.vmc_params.vcp_id)) == -1)
		return (ret);

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (virtio_dev_launch(vm, dev) != 0)
			fatalx("%s: failed to restore virtio dev", __func__);
	}

	return (0);
}

int
viornd_dump(int fd)
{
	log_debug("%s: sending viornd", __func__);

	viornd.vq[0].q_hva = NULL;

	if (atomicio(vwrite, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
		log_warnx("%s: error writing viornd to fd", __func__);
		return (-1);
	}
	return (0);
}

int
vmmci_dump(int fd)
{
	log_debug("%s: sending vmmci", __func__);

	if (atomicio(vwrite, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
		log_warnx("%s: error writing vmmci to fd", __func__);
		return (-1);
	}
	return (0);
}

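/*
 * vionet_dump
 *
 * Ask each vionet subprocess for a copy of its device state over the
 * synchronous channel (VIODEV_MSG_DUMP), clear the fields that are
 * only valid inside the child (host addresses, fds, event state), and
 * write the result to fd. vioblk_dump() below uses the same protocol.
 */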
int
vionet_dump(int fd)
{
	struct virtio_dev	*dev, temp;
	struct viodev_msg	 msg;
	struct imsg		 imsg;
	struct imsgbuf		*ibuf = NULL;
	size_t			 sz;
	int			 ret;

	log_debug("%s: dumping vionet", __func__);

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (dev->dev_type != VMD_DEVTYPE_NET)
			continue;

		memset(&msg, 0, sizeof(msg));
		memset(&imsg, 0, sizeof(imsg));

		ibuf = &dev->sync_iev.ibuf;
		msg.type = VIODEV_MSG_DUMP;

		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
		    sizeof(msg));
		if (ret == -1) {
			log_warnx("%s: failed requesting dump of vionet[%d]",
			    __func__, dev->vionet.idx);
			return (-1);
		}
		if (imsgbuf_flush(ibuf) == -1) {
			log_warnx("%s: imsgbuf_flush", __func__);
			return (-1);
		}

		sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
		if (sz != sizeof(temp)) {
			log_warnx("%s: failed to dump vionet[%d]", __func__,
			    dev->vionet.idx);
			return (-1);
		}

		/* Clear volatile state. Will reinitialize on restore. */
		temp.vionet.vq[RXQ].q_hva = NULL;
		temp.vionet.vq[TXQ].q_hva = NULL;
		temp.async_fd = -1;
		temp.sync_fd = -1;
		memset(&temp.async_iev, 0, sizeof(temp.async_iev));
		memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));

		if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
			log_warnx("%s: error writing vionet to fd", __func__);
			return (-1);
		}
	}

	return (0);
}

int
vioblk_dump(int fd)
{
	struct virtio_dev	*dev, temp;
	struct viodev_msg	 msg;
	struct imsg		 imsg;
	struct imsgbuf		*ibuf = NULL;
	size_t			 sz;
	int			 ret;

	log_debug("%s: dumping vioblk", __func__);

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		if (dev->dev_type != VMD_DEVTYPE_DISK)
			continue;

		memset(&msg, 0, sizeof(msg));
		memset(&imsg, 0, sizeof(imsg));

		ibuf = &dev->sync_iev.ibuf;
		msg.type = VIODEV_MSG_DUMP;

		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
		    sizeof(msg));
		if (ret == -1) {
			log_warnx("%s: failed requesting dump of vioblk[%d]",
			    __func__, dev->vioblk.idx);
			return (-1);
		}
		if (imsgbuf_flush(ibuf) == -1) {
			log_warnx("%s: imsgbuf_flush", __func__);
			return (-1);
		}

		sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
		if (sz != sizeof(temp)) {
			log_warnx("%s: failed to dump vioblk[%d]", __func__,
			    dev->vioblk.idx);
			return (-1);
		}

		/* Clear volatile state. Will reinitialize on restore. */
		temp.vioblk.vq[0].q_hva = NULL;
		temp.async_fd = -1;
		temp.sync_fd = -1;
		memset(&temp.async_iev, 0, sizeof(temp.async_iev));
		memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));

		if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
			log_warnx("%s: error writing vioblk to fd", __func__);
			return (-1);
		}
	}

	return (0);
}

int
vioscsi_dump(int fd)
{
	unsigned int i;

	if (vioscsi == NULL)
		return (0);

	log_debug("%s: sending vioscsi", __func__);

	for (i = 0; i < 3; i++)
		vioscsi->vq[i].q_hva = NULL;

	if (atomicio(vwrite, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
	    sizeof(struct vioscsi_dev)) {
		log_warnx("%s: error writing vioscsi to fd", __func__);
		return (-1);
	}
	return (0);
}

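/*
 * virtio_dump
 *
 * Serialize all virtio device state to fd: viornd, vioblk, vioscsi,
 * vionet, then vmmci. virtio_restore() must read in the same order.
 */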
128895ab188fSccardenas int
1289149417b6Sreyk virtio_dump(int fd)
1290149417b6Sreyk {
1291149417b6Sreyk 	int ret;
1292149417b6Sreyk 
1293149417b6Sreyk 	if ((ret = viornd_dump(fd)) == -1)
1294149417b6Sreyk 		return ret;
1295149417b6Sreyk 
1296149417b6Sreyk 	if ((ret = vioblk_dump(fd)) == -1)
1297149417b6Sreyk 		return ret;
1298149417b6Sreyk 
129995ab188fSccardenas 	if ((ret = vioscsi_dump(fd)) == -1)
130095ab188fSccardenas 		return ret;
130195ab188fSccardenas 
1302149417b6Sreyk 	if ((ret = vionet_dump(fd)) == -1)
1303149417b6Sreyk 		return ret;
1304149417b6Sreyk 
1305ebcaa090Smlarkin 	if ((ret = vmmci_dump(fd)) == -1)
1306149417b6Sreyk 		return ret;
1307149417b6Sreyk 
1308149417b6Sreyk 	return (0);
1309149417b6Sreyk }
1310548054a9Spd 
void
virtio_broadcast_imsg(struct vmd_vm *vm, uint16_t type, void *data,
    uint16_t datalen)
{
	struct virtio_dev *dev;
	int ret;

	SLIST_FOREACH(dev, &virtio_devs, dev_next) {
		ret = imsg_compose_event(&dev->async_iev, type, 0, 0, -1, data,
		    datalen);
		if (ret == -1) {
			log_warnx("%s: failed to broadcast imsg type %u",
			    __func__, type);
		}
	}
}

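/* Ask all emulated devices to pause. */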
void
virtio_stop(struct vmd_vm *vm)
{
	virtio_broadcast_imsg(vm, IMSG_VMDOP_PAUSE_VM, NULL, 0);
}

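/* Ask all emulated devices to resume. */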
void
virtio_start(struct vmd_vm *vm)
{
	virtio_broadcast_imsg(vm, IMSG_VMDOP_UNPAUSE_VM, NULL, 0);
}

/*
 * Fork+exec a child virtio device. Returns 0 on success.
 */
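/*
 * For illustration only: the child re-execs vmd roughly as
 *
 *	vmd -X <sync fd> -t <type char> -i <vmm fd> -p <vm name> [-d] [-v|-vv]
 *
 * where the fd numbers are inherited across fork (see the nargv
 * construction below).
 */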
static int
virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev)
{
	char *nargv[12], num[32], vmm_fd[32], vm_name[VM_NAME_MAX], t[2];
	pid_t dev_pid;
	int sync_fds[2], async_fds[2], ret = 0;
	size_t i, sz = 0;
	struct viodev_msg msg;
	struct virtio_dev *dev_entry;
	struct imsg imsg;
	struct imsgev *iev = &dev->sync_iev;

	switch (dev->dev_type) {
	case VMD_DEVTYPE_NET:
		log_debug("%s: launching vionet%d",
		    vm->vm_params.vmc_params.vcp_name, dev->vionet.idx);
		break;
	case VMD_DEVTYPE_DISK:
		log_debug("%s: launching vioblk%d",
		    vm->vm_params.vmc_params.vcp_name, dev->vioblk.idx);
		break;
	default:
		log_warnx("%s: invalid device type", __func__);
		return (EINVAL);
	}

	/* We need two channels: one synchronous (IO reads) and one async. */
	if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, PF_UNSPEC,
	    sync_fds) == -1) {
		ret = errno;
		log_warn("failed to create socketpair");
		return (ret);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, PF_UNSPEC,
	    async_fds) == -1) {
		ret = errno;
		log_warn("failed to create async socketpair");
		close_fd(sync_fds[0]);
		close_fd(sync_fds[1]);
		return (ret);
	}

	/* Fork... */
	dev_pid = fork();
	if (dev_pid == -1) {
		ret = errno;
		log_warn("%s: fork failed", __func__);
		goto err;
	}

	if (dev_pid > 0) {
		/* Parent */
		close_fd(sync_fds[1]);
		close_fd(async_fds[1]);

		/* Save the child's pid to help with cleanup. */
		dev->dev_pid = dev_pid;

		/* Set the channel fds to the child's before sending. */
		dev->sync_fd = sync_fds[1];
		dev->async_fd = async_fds[1];

		/* 1. Send over our configured device. */
		log_debug("%s: sending '%c' type device struct", __func__,
		    dev->dev_type);
		sz = atomicio(vwrite, sync_fds[0], dev, sizeof(*dev));
		if (sz != sizeof(*dev)) {
			log_warnx("%s: failed to send device", __func__);
			ret = EIO;
			goto err;
		}

		/* Close data fds. Only the child device needs them now. */
		if (virtio_dev_closefds(dev) == -1) {
			log_warnx("%s: failed to close device data fds",
			    __func__);
			ret = EIO;
			goto err;
		}

		/* 2. Send over details on the VM (including memory fds). */
		log_debug("%s: sending vm message for '%s'", __func__,
		    vm->vm_params.vmc_params.vcp_name);
		sz = atomicio(vwrite, sync_fds[0], vm, sizeof(*vm));
		if (sz != sizeof(*vm)) {
			log_warnx("%s: failed to send vm details", __func__);
			ret = EIO;
			goto err;
		}

		/*
		 * Initialize our imsg channel to the child device. The initial
		 * communication will be synchronous. We expect the child to
		 * report itself "ready" to confirm the launch was a success.
		 */
		if (imsgbuf_init(&iev->ibuf, sync_fds[0]) == -1) {
			ret = errno;
			log_warn("%s: failed to init imsgbuf", __func__);
			goto err;
		}
		imsgbuf_allow_fdpass(&iev->ibuf);
		ret = imsgbuf_read_one(&iev->ibuf, &imsg);
		if (ret == 0 || ret == -1) {
			log_warnx("%s: failed to receive ready message from "
			    "'%c' type device", __func__, dev->dev_type);
			ret = EIO;
			goto err;
		}
		ret = 0;

		IMSG_SIZE_CHECK(&imsg, &msg);
		memcpy(&msg, imsg.data, sizeof(msg));
		imsg_free(&imsg);

		if (msg.type != VIODEV_MSG_READY) {
			log_warnx("%s: expected ready message, got type %d",
			    __func__, msg.type);
			ret = EINVAL;
			goto err;
		}
		log_debug("%s: device reports ready via sync channel",
		    __func__);

		/*
		 * Wire in the async event handling, but only after restoring
		 * the parent's ends of the channels.
		 */
		dev->sync_fd = sync_fds[0];
		dev->async_fd = async_fds[0];
		vm_device_pipe(dev, virtio_dispatch_dev, NULL);
	} else {
		/* Child */
		close_fd(async_fds[0]);
		close_fd(sync_fds[0]);

		/* Close the pty. Virtio devices do not need it. */
		close_fd(vm->vm_tty);
		vm->vm_tty = -1;

		if (vm->vm_cdrom != -1) {
			close_fd(vm->vm_cdrom);
			vm->vm_cdrom = -1;
		}

		/*
		 * Close the other devices' fds so that only this device's
		 * data file descriptors stay open across the exec.
		 */
		SLIST_FOREACH(dev_entry, &virtio_devs, dev_next) {
			if (dev_entry == dev)
				continue;
			if (virtio_dev_closefds(dev_entry) == -1)
				fatalx("unable to close other virtio devs");
		}

		memset(num, 0, sizeof(num));
		snprintf(num, sizeof(num), "%d", sync_fds[1]);
		memset(vmm_fd, 0, sizeof(vmm_fd));
		snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);
		memset(vm_name, 0, sizeof(vm_name));
		snprintf(vm_name, sizeof(vm_name), "%s",
		    vm->vm_params.vmc_params.vcp_name);

		t[0] = dev->dev_type;
		t[1] = '\0';

		i = 0;
		nargv[i++] = env->argv0;
		nargv[i++] = "-X";
		nargv[i++] = num;
		nargv[i++] = "-t";
		nargv[i++] = t;
		nargv[i++] = "-i";
		nargv[i++] = vmm_fd;
		nargv[i++] = "-p";
		nargv[i++] = vm_name;
		if (env->vmd_debug)
			nargv[i++] = "-d";
		if (env->vmd_verbose == 1)
			nargv[i++] = "-v";
		else if (env->vmd_verbose > 1)
			nargv[i++] = "-vv";
		nargv[i++] = NULL;
		if (i > sizeof(nargv) / sizeof(nargv[0]))
			fatalx("%s: nargv overflow", __func__);

		/* Control resumes in vmd.c:main(). */
		execvp(nargv[0], nargv);

		ret = errno;
		log_warn("%s: failed to exec device", __func__);
		_exit(ret);
		/* NOTREACHED */
	}

	return (ret);

err:
	close_fd(sync_fds[0]);
	close_fd(sync_fds[1]);
	close_fd(async_fds[0]);
	close_fd(async_fds[1]);
	return (ret);
}

/*
 * Initialize an async imsg channel for a virtio device.
 */
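/*
 * Note: the parent process passes a NULL ev_base (see
 * virtio_dev_launch()), which is expected to attach events to the
 * default event base; device subprocesses supply their own.
 */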
int
vm_device_pipe(struct virtio_dev *dev, void (*cb)(int, short, void *),
    struct event_base *ev_base)
{
	struct imsgev *iev = &dev->async_iev;
	int fd = dev->async_fd;

	log_debug("%s: initializing '%c' device pipe (fd=%d)", __func__,
	    dev->dev_type, fd);

	if (imsgbuf_init(&iev->ibuf, fd) == -1)
		fatal("imsgbuf_init");
	imsgbuf_allow_fdpass(&iev->ibuf);
	iev->handler = cb;
	iev->data = dev;
	iev->events = EV_READ;
	imsg_event_add2(iev, ev_base);

	return (0);
}

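/*
 * Event handler for a device subprocess's async channel: drains any
 * readable imsgs and dispatches them, tearing down the handler and
 * leaving the event loop if the pipe dies.
 */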
void
virtio_dispatch_dev(int fd, short event, void *arg)
{
	struct virtio_dev	*dev = (struct virtio_dev *)arg;
	struct imsgev		*iev = &dev->async_iev;
	struct imsgbuf		*ibuf = &iev->ibuf;
	struct imsg		 imsg;
	struct viodev_msg	 msg;
	ssize_t			 n = 0;

	if (event & EV_READ) {
		if ((n = imsgbuf_read(ibuf)) == -1)
			fatal("%s: imsgbuf_read", __func__);
		if (n == 0) {
			/* this pipe is dead, so remove the event handler */
			log_debug("%s: pipe dead (EV_READ)", __func__);
			event_del(&iev->ev);
			event_loopexit(NULL);
			return;
		}
	}

	if (event & EV_WRITE) {
		if (imsgbuf_write(ibuf) == -1) {
			if (errno == EPIPE) {
				/* this pipe is dead, remove the handler */
				log_debug("%s: pipe dead (EV_WRITE)", __func__);
				event_del(&iev->ev);
				event_loopexit(NULL);
				return;
			}
			fatal("%s: imsgbuf_write", __func__);
		}
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

		switch (imsg.hdr.type) {
		case IMSG_DEVOP_MSG:
			IMSG_SIZE_CHECK(&imsg, &msg);
			memcpy(&msg, imsg.data, sizeof(msg));
			handle_dev_msg(&msg, dev);
			break;
		default:
			log_warnx("%s: got non devop imsg %d", __func__,
			    imsg.hdr.type);
			break;
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}

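/*
 * Process a single message from a device subprocess. Only interrupt
 * kicks and status reports are expected here; IO messages travel over
 * the synchronous channel (see virtio_pci_io()).
 */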
static int
handle_dev_msg(struct viodev_msg *msg, struct virtio_dev *gdev)
{
	uint32_t vm_id = gdev->vm_id;
	int irq = gdev->irq;

	switch (msg->type) {
	case VIODEV_MSG_KICK:
		if (msg->state == INTR_STATE_ASSERT)
			vcpu_assert_irq(vm_id, msg->vcpu, irq);
		else if (msg->state == INTR_STATE_DEASSERT)
			vcpu_deassert_irq(vm_id, msg->vcpu, irq);
		break;
	case VIODEV_MSG_READY:
		log_debug("%s: device reports ready", __func__);
		break;
	case VIODEV_MSG_ERROR:
		log_warnx("%s: device reported error", __func__);
		break;
	case VIODEV_MSG_INVALID:
	case VIODEV_MSG_IO_READ:
	case VIODEV_MSG_IO_WRITE:
		/* FALLTHROUGH */
	default:
		log_warnx("%s: unsupported device message type %d", __func__,
		    msg->type);
		return (1);
	}

	return (0);
}

/*
 * Called by the VM process while processing IO from the VCPU thread.
 *
 * N.b. Since the VCPU thread calls this function, we cannot mutate the event
 * system. All ipc messages must be sent manually and cannot be queued for
 * the event loop to push them. (We need to perform a synchronous read, so
 * this isn't really a big deal.)
 */
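/*
 * A minimal sketch of the calling convention, using an illustrative
 * register offset from virtioreg.h (dev being the struct virtio_dev
 * cookie registered with the PCI layer): a guest register read passes
 * a non-zero dir and receives the value through *data, e.g.
 *
 *	uint32_t data = 0;
 *	uint8_t intr = 0xff;
 *	if (virtio_pci_io(1, VIRTIO_CONFIG_QUEUE_SIZE, &data, &intr,
 *	    dev, sizeof(uint16_t)) == 0)
 *		... data now holds the device's queue size ...
 *
 * A guest register write passes dir == 0 with the value in *data.
 */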
int
virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
    void *cookie, uint8_t sz)
{
	struct virtio_dev *dev = (struct virtio_dev *)cookie;
	struct imsgbuf *ibuf = &dev->sync_iev.ibuf;
	struct imsg imsg;
	struct viodev_msg msg;
	int ret = 0;

	memset(&msg, 0, sizeof(msg));
	msg.reg = reg;
	msg.io_sz = sz;

	if (dir == 0) {
		msg.type = VIODEV_MSG_IO_WRITE;
		msg.data = *data;
		msg.data_valid = 1;
	} else
		msg.type = VIODEV_MSG_IO_READ;

	if (msg.type == VIODEV_MSG_IO_WRITE) {
		/*
		 * Write request. No reply expected.
		 */
		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
		    sizeof(msg));
		if (ret == -1) {
			log_warnx("%s: failed to send async io event to virtio"
			    " device", __func__);
			return (ret);
		}
		if (imsgbuf_flush(ibuf) == -1) {
			log_warnx("%s: imsgbuf_flush (write)", __func__);
			return (-1);
		}
	} else {
		/*
		 * Read request. Requires waiting for a reply.
		 */
		ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
		    sizeof(msg));
		if (ret == -1) {
			log_warnx("%s: failed to send sync io event to virtio"
			    " device", __func__);
			return (ret);
		}
		if (imsgbuf_flush(ibuf) == -1) {
			log_warnx("%s: imsgbuf_flush (read)", __func__);
			return (-1);
		}

		/* Read our reply. */
		ret = imsgbuf_read_one(ibuf, &imsg);
		if (ret == 0 || ret == -1) {
			log_warnx("%s: imsgbuf_read_one (ret=%d)", __func__,
			    ret);
			return (-1);
		}
		IMSG_SIZE_CHECK(&imsg, &msg);
		memcpy(&msg, imsg.data, sizeof(msg));
		imsg_free(&imsg);

		if (msg.type == VIODEV_MSG_IO_READ && msg.data_valid) {
#if DEBUG
			log_debug("%s: got sync read response (reg=%s)",
			    __func__, virtio_reg_name(msg.reg));
#endif /* DEBUG */
			*data = msg.data;
			/*
			 * It's possible we're asked to {de,}assert after the
			 * device performs a register read.
			 */
			if (msg.state == INTR_STATE_ASSERT)
				vcpu_assert_irq(dev->vm_id, msg.vcpu, msg.irq);
			else if (msg.state == INTR_STATE_DEASSERT)
				vcpu_deassert_irq(dev->vm_id, msg.vcpu, msg.irq);
		} else {
			log_warnx("%s: expected IO_READ, got %d", __func__,
			    msg.type);
			return (-1);
		}
	}

	return (0);
}

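/*
 * Ask the device subprocess to assert the device's irq on the given
 * vcpu. Queued on the async channel for the event loop to deliver.
 */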
void
virtio_assert_irq(struct virtio_dev *dev, int vcpu)
{
	struct viodev_msg msg;
	int ret;

	memset(&msg, 0, sizeof(msg));
	msg.irq = dev->irq;
	msg.vcpu = vcpu;
	msg.type = VIODEV_MSG_KICK;
	msg.state = INTR_STATE_ASSERT;

	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
	    &msg, sizeof(msg));
	if (ret == -1)
		log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
}

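/* As above, but request deassertion of the device's irq. */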
17663481ecdfSdv void
1767c4fd4c5bSdv virtio_deassert_irq(struct virtio_dev *dev, int vcpu)
17683481ecdfSdv {
17693481ecdfSdv 	struct viodev_msg msg;
17703481ecdfSdv 	int ret;
17713481ecdfSdv 
17723481ecdfSdv 	memset(&msg, 0, sizeof(msg));
17733481ecdfSdv 	msg.irq = dev->irq;
17743481ecdfSdv 	msg.vcpu = vcpu;
17753481ecdfSdv 	msg.type = VIODEV_MSG_KICK;
17763481ecdfSdv 	msg.state = INTR_STATE_DEASSERT;
17773481ecdfSdv 
17783481ecdfSdv 	ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
17793481ecdfSdv 	    &msg, sizeof(msg));
17803481ecdfSdv 	if (ret == -1)
17813481ecdfSdv 		log_warnx("%s: failed to deassert irq %d", __func__, dev->irq);
1782548054a9Spd }
1783b3bc6112Sdv 
1784b3bc6112Sdv /*
1785b3bc6112Sdv  * Close all underlying file descriptors for a given virtio device.
1786b3bc6112Sdv  */
1787b3bc6112Sdv static int
1788b3bc6112Sdv virtio_dev_closefds(struct virtio_dev *dev)
1789b3bc6112Sdv {
1790b3bc6112Sdv 	size_t i;
1791b3bc6112Sdv 
1792b3bc6112Sdv 	switch (dev->dev_type) {
1793b3bc6112Sdv 		case VMD_DEVTYPE_DISK:
1794b3bc6112Sdv 			for (i = 0; i < dev->vioblk.ndisk_fd; i++) {
1795b3bc6112Sdv 				close_fd(dev->vioblk.disk_fd[i]);
1796b3bc6112Sdv 				dev->vioblk.disk_fd[i] = -1;
1797b3bc6112Sdv 			}
1798b3bc6112Sdv 			break;
1799b3bc6112Sdv 		case VMD_DEVTYPE_NET:
1800b3bc6112Sdv 			close_fd(dev->vionet.data_fd);
1801b3bc6112Sdv 			dev->vionet.data_fd = -1;
1802b3bc6112Sdv 			break;
1803b3bc6112Sdv 	default:
1804b3bc6112Sdv 		log_warnx("%s: invalid device type", __func__);
1805b3bc6112Sdv 		return (-1);
1806b3bc6112Sdv 	}
1807b3bc6112Sdv 
180878979b66Sdv 	close_fd(dev->async_fd);
180978979b66Sdv 	dev->async_fd = -1;
181078979b66Sdv 	close_fd(dev->sync_fd);
181178979b66Sdv 	dev->sync_fd = -1;
181278979b66Sdv 
1813b3bc6112Sdv 	return (0);
1814b3bc6112Sdv }
1815