1 /*	$OpenBSD: virtio.c,v 1.99 2022/12/28 21:30:19 jmc Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* PAGE_SIZE */
20 #include <sys/socket.h>
21 
22 #include <machine/vmmvar.h>
23 #include <dev/pci/pcireg.h>
24 #include <dev/pci/pcidevs.h>
25 #include <dev/pv/virtioreg.h>
26 #include <dev/pci/virtio_pcireg.h>
27 #include <dev/pv/vioblkreg.h>
28 #include <dev/pv/vioscsireg.h>
29 
30 #include <net/if.h>
31 #include <netinet/in.h>
32 #include <netinet/if_ether.h>
33 #include <netinet/ip.h>
34 
35 #include <errno.h>
36 #include <event.h>
37 #include <poll.h>
38 #include <stddef.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <unistd.h>
42 
43 #include "atomicio.h"
44 #include "pci.h"
45 #include "vioscsi.h"
46 #include "virtio.h"
47 #include "vmd.h"
48 #include "vmm.h"
49 
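/*
 * Global device state: viornd and vmmci are single embedded instances,
 * while vioblk, vionet and vioscsi are allocated in virtio_init() from
 * the VM's configuration; nr_vionet and nr_vioblk record how many
 * network and block devices were created.
 */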
50 extern char *__progname;
51 struct viornd_dev viornd;
52 struct vioblk_dev *vioblk;
53 struct vionet_dev *vionet;
54 struct vioscsi_dev *vioscsi;
55 struct vmmci_dev vmmci;
56 
57 int nr_vionet;
58 int nr_vioblk;
59 
60 #define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */
61 
62 #define VIRTIO_NET_F_MAC	(1<<5)
63 
64 #define VMMCI_F_TIMESYNC	(1<<0)
65 #define VMMCI_F_ACK		(1<<1)
66 #define VMMCI_F_SYNCRTC		(1<<2)
67 
68 #define RXQ	0
69 #define TXQ	1
70 
71 const char *
72 vioblk_cmd_name(uint32_t type)
73 {
74 	switch (type) {
75 	case VIRTIO_BLK_T_IN: return "read";
76 	case VIRTIO_BLK_T_OUT: return "write";
77 	case VIRTIO_BLK_T_SCSI_CMD: return "scsi read";
78 	case VIRTIO_BLK_T_SCSI_CMD_OUT: return "scsi write";
79 	case VIRTIO_BLK_T_FLUSH: return "flush";
80 	case VIRTIO_BLK_T_FLUSH_OUT: return "flush out";
81 	case VIRTIO_BLK_T_GET_ID: return "get id";
82 	default: return "unknown";
83 	}
84 }
85 
86 static const char *
87 virtio_reg_name(uint8_t reg)
88 {
89 	switch (reg) {
90 	case VIRTIO_CONFIG_DEVICE_FEATURES: return "device feature";
91 	case VIRTIO_CONFIG_GUEST_FEATURES: return "guest feature";
92 	case VIRTIO_CONFIG_QUEUE_PFN: return "queue address";
93 	case VIRTIO_CONFIG_QUEUE_SIZE: return "queue size";
94 	case VIRTIO_CONFIG_QUEUE_SELECT: return "queue select";
95 	case VIRTIO_CONFIG_QUEUE_NOTIFY: return "queue notify";
96 	case VIRTIO_CONFIG_DEVICE_STATUS: return "device status";
97 	case VIRTIO_CONFIG_ISR_STATUS: return "isr status";
98 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI: return "device config 0";
99 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4: return "device config 1";
100 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: return "device config 2";
101 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: return "device config 3";
102 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: return "device config 4";
103 	default: return "unknown";
104 	}
105 }
106 
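/*
 * Bytes of guest memory needed for a legacy (split) virtqueue of the
 * given size: the descriptor table and avail ring share the first
 * aligned allocation, the used ring gets the second.  As a worked
 * example - assuming the usual 16-byte vring_desc, 8-byte
 * vring_used_elem and a VIRTQUEUE_ALIGN() that rounds up to the
 * 4096-byte VIRTIO_PAGE_SIZE - a 256-entry queue takes
 * ALIGN(16*256 + 2*(2+256)) + ALIGN(2*2 + 8*256) = 8192 + 4096 bytes.
 */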
107 uint32_t
108 vring_size(uint32_t vq_size)
109 {
110 	uint32_t allocsize1, allocsize2;
111 
112 	/* allocsize1: descriptor table + avail ring + pad */
113 	allocsize1 = VIRTQUEUE_ALIGN(sizeof(struct vring_desc) * vq_size
114 	    + sizeof(uint16_t) * (2 + vq_size));
115 	/* allocsize2: used ring + pad */
116 	allocsize2 = VIRTQUEUE_ALIGN(sizeof(uint16_t) * 2
117 	    + sizeof(struct vring_used_elem) * vq_size);
118 
119 	return allocsize1 + allocsize2;
120 }
121 
122 /* Update queue select */
123 void
124 viornd_update_qs(void)
125 {
126 	struct virtio_vq_info *vq_info;
127 
128 	/* Invalid queue? */
129 	if (viornd.cfg.queue_select > 0) {
130 		viornd.cfg.queue_size = 0;
131 		return;
132 	}
133 
134 	vq_info = &viornd.vq[viornd.cfg.queue_select];
135 
136 	/* Update queue pfn/size based on queue select */
137 	viornd.cfg.queue_pfn = vq_info->q_gpa >> 12;
138 	viornd.cfg.queue_size = vq_info->qs;
139 }
140 
141 /* Update queue address */
142 void
143 viornd_update_qa(void)
144 {
145 	struct virtio_vq_info *vq_info;
146 	void *hva = NULL;
147 
148 	/* Invalid queue? */
149 	if (viornd.cfg.queue_select > 0)
150 		return;
151 
152 	vq_info = &viornd.vq[viornd.cfg.queue_select];
153 	vq_info->q_gpa = (uint64_t)viornd.cfg.queue_pfn * VIRTIO_PAGE_SIZE;
154 
155 	hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIORND_QUEUE_SIZE));
156 	if (hva == NULL)
157 		fatal("viornd_update_qa");
158 	vq_info->q_hva = hva;
159 }
160 
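/*
 * Service a notify on the entropy queue: fetch the guest's writable
 * buffer descriptor from the avail ring, fill it with arc4random_buf()
 * data, place it in the used ring and advance used->idx behind a
 * memory barrier.  Returns 1 when the caller should raise the device
 * interrupt.
 */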
161 int
162 viornd_notifyq(void)
163 {
164 	size_t sz;
165 	int dxx, ret;
166 	uint16_t aidx, uidx;
167 	char *vr, *rnd_data;
168 	struct vring_desc *desc;
169 	struct vring_avail *avail;
170 	struct vring_used *used;
171 	struct virtio_vq_info *vq_info;
172 
173 	ret = 0;
174 
175 	/* Invalid queue? */
176 	if (viornd.cfg.queue_notify > 0)
177 		return (0);
178 
179 	vq_info = &viornd.vq[viornd.cfg.queue_notify];
180 	vr = vq_info->q_hva;
181 	if (vr == NULL)
182 		fatalx("%s: null vring", __func__);
183 
184 	desc = (struct vring_desc *)(vr);
185 	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
186 	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
187 
188 	aidx = avail->idx & VIORND_QUEUE_MASK;
189 	uidx = used->idx & VIORND_QUEUE_MASK;
190 
191 	dxx = avail->ring[aidx] & VIORND_QUEUE_MASK;
192 
193 	sz = desc[dxx].len;
194 	if (sz > MAXPHYS)
195 		fatalx("viornd descriptor size too large (%zu)", sz);
196 
197 	rnd_data = malloc(sz);
198 
199 	if (rnd_data != NULL) {
200 		arc4random_buf(rnd_data, sz);
201 		if (write_mem(desc[dxx].addr, rnd_data, sz)) {
202 			log_warnx("viornd: can't write random data @ "
203 			    "0x%llx",
204 			    desc[dxx].addr);
205 		} else {
206 			/* ret == 1 -> interrupt needed */
207 			/* XXX check VIRTIO_F_NO_INTR */
208 			ret = 1;
209 			viornd.cfg.isr_status = 1;
210 			used->ring[uidx].id = dxx;
211 			used->ring[uidx].len = sz;
212 			__sync_synchronize();
213 			used->idx++;
214 		}
215 		free(rnd_data);
216 	} else
217 		fatal("memory allocation error for viornd data");
218 
219 	return (ret);
220 }
221 
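/*
 * I/O-space register handler for the entropy device: dir == 0 is a
 * guest write to register "reg", anything else is a read that fills in
 * *data.  *intr starts out as 0xFF and is set to 1 when the notify
 * produced data and an interrupt should be injected; exactly how the
 * caller in pci.c maps that value onto an IRQ assertion is assumed
 * here rather than shown in this file.
 */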
222 int
223 virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
224     void *unused, uint8_t sz)
225 {
226 	*intr = 0xFF;
227 
228 	if (dir == 0) {
229 		switch (reg) {
230 		case VIRTIO_CONFIG_DEVICE_FEATURES:
231 		case VIRTIO_CONFIG_QUEUE_SIZE:
232 		case VIRTIO_CONFIG_ISR_STATUS:
233 			log_warnx("%s: illegal write %x to %s",
234 			    __progname, *data, virtio_reg_name(reg));
235 			break;
236 		case VIRTIO_CONFIG_GUEST_FEATURES:
237 			viornd.cfg.guest_feature = *data;
238 			break;
239 		case VIRTIO_CONFIG_QUEUE_PFN:
240 			viornd.cfg.queue_pfn = *data;
241 			viornd_update_qa();
242 			break;
243 		case VIRTIO_CONFIG_QUEUE_SELECT:
244 			viornd.cfg.queue_select = *data;
245 			viornd_update_qs();
246 			break;
247 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
248 			viornd.cfg.queue_notify = *data;
249 			if (viornd_notifyq())
250 				*intr = 1;
251 			break;
252 		case VIRTIO_CONFIG_DEVICE_STATUS:
253 			viornd.cfg.device_status = *data;
254 			break;
255 		}
256 	} else {
257 		switch (reg) {
258 		case VIRTIO_CONFIG_DEVICE_FEATURES:
259 			*data = viornd.cfg.device_feature;
260 			break;
261 		case VIRTIO_CONFIG_GUEST_FEATURES:
262 			*data = viornd.cfg.guest_feature;
263 			break;
264 		case VIRTIO_CONFIG_QUEUE_PFN:
265 			*data = viornd.cfg.queue_pfn;
266 			break;
267 		case VIRTIO_CONFIG_QUEUE_SIZE:
268 			*data = viornd.cfg.queue_size;
269 			break;
270 		case VIRTIO_CONFIG_QUEUE_SELECT:
271 			*data = viornd.cfg.queue_select;
272 			break;
273 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
274 			*data = viornd.cfg.queue_notify;
275 			break;
276 		case VIRTIO_CONFIG_DEVICE_STATUS:
277 			*data = viornd.cfg.device_status;
278 			break;
279 		case VIRTIO_CONFIG_ISR_STATUS:
280 			*data = viornd.cfg.isr_status;
281 			viornd.cfg.isr_status = 0;
282 			vcpu_deassert_pic_irq(viornd.vm_id, 0, viornd.irq);
283 			break;
284 		}
285 	}
286 	return (0);
287 }
288 
289 void
290 vioblk_update_qa(struct vioblk_dev *dev)
291 {
292 	struct virtio_vq_info *vq_info;
293 	void *hva = NULL;
294 
295 	/* Invalid queue? */
296 	if (dev->cfg.queue_select > 0)
297 		return;
298 
299 	vq_info = &dev->vq[dev->cfg.queue_select];
300 	vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE;
301 
302 	hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIOBLK_QUEUE_SIZE));
303 	if (hva == NULL)
304 		fatal("vioblk_update_qa");
305 	vq_info->q_hva = hva;
306 }
307 
308 void
309 vioblk_update_qs(struct vioblk_dev *dev)
310 {
311 	struct virtio_vq_info *vq_info;
312 
313 	/* Invalid queue? */
314 	if (dev->cfg.queue_select > 0) {
315 		dev->cfg.queue_size = 0;
316 		return;
317 	}
318 
319 	vq_info = &dev->vq[dev->cfg.queue_select];
320 
321 	/* Update queue pfn/size based on queue select */
322 	dev->cfg.queue_pfn = vq_info->q_gpa >> 12;
323 	dev->cfg.queue_size = vq_info->qs;
324 }
325 
326 static void
327 vioblk_free_info(struct ioinfo *info)
328 {
329 	if (!info)
330 		return;
331 	free(info->buf);
332 	free(info);
333 }
334 
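/*
 * The helpers below split each block transfer in two steps: a start
 * function allocates a struct ioinfo holding a bounce buffer, the byte
 * length, the byte offset derived from the 512-byte sector number and
 * a pointer to the disk's virtio_backing; the matching finish function
 * then performs the actual pread/pwrite through that backing.
 */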
335 static struct ioinfo *
336 vioblk_start_read(struct vioblk_dev *dev, off_t sector, size_t sz)
337 {
338 	struct ioinfo *info;
339 
340 	/* Limit to 64M for now */
341 	if (sz > (1 << 26)) {
342 		log_warnx("%s: read size exceeded 64M", __func__);
343 		return (NULL);
344 	}
345 
346 	info = calloc(1, sizeof(*info));
347 	if (!info)
348 		goto nomem;
349 	info->buf = malloc(sz);
350 	if (info->buf == NULL)
351 		goto nomem;
352 	info->len = sz;
353 	info->offset = sector * VIRTIO_BLK_SECTOR_SIZE;
354 	info->file = &dev->file;
355 
356 	return info;
357 
358 nomem:
359 	free(info);
360 	log_warn("malloc error vioblk read");
361 	return (NULL);
362 }
363 
364 
365 static const uint8_t *
366 vioblk_finish_read(struct ioinfo *info)
367 {
368 	struct virtio_backing *file;
369 
370 	file = info->file;
371 	if (file->pread(file->p, info->buf, info->len, info->offset) != info->len) {
372 		info->error = errno;
373 		log_warn("vioblk read error");
374 		return NULL;
375 	}
376 
377 	return info->buf;
378 }
379 
380 static struct ioinfo *
381 vioblk_start_write(struct vioblk_dev *dev, off_t sector,
382     paddr_t addr, size_t len)
383 {
384 	struct ioinfo *info;
385 
386 	/* Limit to 64M for now */
387 	if (len > (1 << 26)) {
388 		log_warnx("%s: write size exceeded 64M", __func__);
389 		return (NULL);
390 	}
391 
392 	info = calloc(1, sizeof(*info));
393 	if (!info)
394 		goto nomem;
395 
396 	info->buf = malloc(len);
397 	if (info->buf == NULL)
398 		goto nomem;
399 	info->len = len;
400 	info->offset = sector * VIRTIO_BLK_SECTOR_SIZE;
401 	info->file = &dev->file;
402 
403 	if (read_mem(addr, info->buf, info->len)) {
404 		vioblk_free_info(info);
405 		return NULL;
406 	}
407 
408 	return info;
409 
410 nomem:
411 	free(info);
412 	log_warn("malloc error vioblk write");
413 	return (NULL);
414 }
415 
416 static int
417 vioblk_finish_write(struct ioinfo *info)
418 {
419 	struct virtio_backing *file;
420 
421 	file = info->file;
422 	if (file->pwrite(file->p, info->buf, info->len, info->offset) != info->len) {
423 		log_warn("vioblk write error");
424 		return EIO;
425 	}
426 	return 0;
427 }
428 
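/*
 * A virtio block request arrives as a descriptor chain of at least
 * three parts: a device-readable header descriptor holding a struct
 * virtio_blk_req_hdr (its type and sector fields are used below), one
 * or more data descriptors, and a final device-writable descriptor
 * into which the one-byte request status (ds) is written back.
 */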
429 /*
430  * XXX in various cases, ds should be set to VIRTIO_BLK_S_IOERR, if we can
431  */
432 int
433 vioblk_notifyq(struct vioblk_dev *dev)
434 {
435 	uint16_t idx, cmd_desc_idx, secdata_desc_idx, ds_desc_idx;
436 	uint8_t ds;
437 	int cnt;
438 	off_t secbias;
439 	char *vr;
440 	struct vring_desc *desc, *cmd_desc, *secdata_desc, *ds_desc;
441 	struct vring_avail *avail;
442 	struct vring_used *used;
443 	struct virtio_blk_req_hdr cmd;
444 	struct virtio_vq_info *vq_info;
445 
446 	/* Invalid queue? */
447 	if (dev->cfg.queue_notify > 0)
448 		return (0);
449 
450 	vq_info = &dev->vq[dev->cfg.queue_notify];
451 	vr = vq_info->q_hva;
452 	if (vr == NULL)
453 		fatalx("%s: null vring", __func__);
454 
455 	/* Compute offsets in ring of descriptors, avail ring, and used ring */
456 	desc = (struct vring_desc *)(vr);
457 	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
458 	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
459 
460 	idx = vq_info->last_avail & VIOBLK_QUEUE_MASK;
461 
462 	if ((avail->idx & VIOBLK_QUEUE_MASK) == idx) {
463 		log_debug("%s - nothing to do?", __func__);
464 		return (0);
465 	}
466 
467 	while (idx != (avail->idx & VIOBLK_QUEUE_MASK)) {
468 
469 		ds = VIRTIO_BLK_S_IOERR;
470 		cmd_desc_idx = avail->ring[idx] & VIOBLK_QUEUE_MASK;
471 		cmd_desc = &desc[cmd_desc_idx];
472 
473 		if ((cmd_desc->flags & VRING_DESC_F_NEXT) == 0) {
474 			log_warnx("unchained vioblk cmd descriptor received "
475 			    "(idx %d)", cmd_desc_idx);
476 			goto out;
477 		}
478 
479 		/* Read command from descriptor ring */
480 		if (cmd_desc->flags & VRING_DESC_F_WRITE) {
481 			log_warnx("vioblk: unexpected writable cmd descriptor "
482 			    "%d", cmd_desc_idx);
483 			goto out;
484 		}
485 		if (read_mem(cmd_desc->addr, &cmd, sizeof(cmd))) {
486 			log_warnx("vioblk: command read_mem error @ 0x%llx",
487 			    cmd_desc->addr);
488 			goto out;
489 		}
490 
491 		switch (cmd.type) {
492 		case VIRTIO_BLK_T_IN:
493 			/* first descriptor */
494 			secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
495 			secdata_desc = &desc[secdata_desc_idx];
496 
497 			if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
498 				log_warnx("unchained vioblk data descriptor "
499 				    "received (idx %d)", cmd_desc_idx);
500 				goto out;
501 			}
502 
503 			cnt = 0;
504 			secbias = 0;
505 			do {
506 				struct ioinfo *info;
507 				const uint8_t *secdata;
508 
509 				if ((secdata_desc->flags & VRING_DESC_F_WRITE)
510 				    == 0) {
511 					log_warnx("vioblk: unwritable data "
512 					    "descriptor %d", secdata_desc_idx);
513 					goto out;
514 				}
515 
516 				info = vioblk_start_read(dev,
517 				    cmd.sector + secbias, secdata_desc->len);
518 
519 				if (info == NULL) {
520 					log_warnx("vioblk: can't start read");
521 					goto out;
522 				}
523 
524 				/* read the data, use current data descriptor */
525 				secdata = vioblk_finish_read(info);
526 				if (secdata == NULL) {
527 					vioblk_free_info(info);
528 					log_warnx("vioblk: block read error, "
529 					    "sector %lld", cmd.sector);
530 					goto out;
531 				}
532 
533 				if (write_mem(secdata_desc->addr, secdata,
534 					secdata_desc->len)) {
535 					log_warnx("can't write sector "
536 					    "data to gpa @ 0x%llx",
537 					    secdata_desc->addr);
538 					vioblk_free_info(info);
539 					goto out;
540 				}
541 
542 				vioblk_free_info(info);
543 
544 				secbias += (secdata_desc->len /
545 				    VIRTIO_BLK_SECTOR_SIZE);
546 				secdata_desc_idx = secdata_desc->next &
547 				    VIOBLK_QUEUE_MASK;
548 				secdata_desc = &desc[secdata_desc_idx];
549 
550 				/* Guard against infinite chains */
551 				if (++cnt >= VIOBLK_QUEUE_SIZE) {
552 					log_warnx("%s: descriptor table "
553 					    "invalid", __func__);
554 					goto out;
555 				}
556 			} while (secdata_desc->flags & VRING_DESC_F_NEXT);
557 
558 			ds_desc_idx = secdata_desc_idx;
559 			ds_desc = secdata_desc;
560 
561 			ds = VIRTIO_BLK_S_OK;
562 			break;
563 		case VIRTIO_BLK_T_OUT:
564 			secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
565 			secdata_desc = &desc[secdata_desc_idx];
566 
567 			if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
568 				log_warnx("wr vioblk: unchained vioblk data "
569 				    "descriptor received (idx %d)",
570 				    cmd_desc_idx);
571 				goto out;
572 			}
573 
574 			if (secdata_desc->len > dev->max_xfer) {
575 				log_warnx("%s: invalid write size %d requested",
576 				    __func__, secdata_desc->len);
577 				goto out;
578 			}
579 
580 			cnt = 0;
581 			secbias = 0;
582 			do {
583 				struct ioinfo *info;
584 
585 				if (secdata_desc->flags & VRING_DESC_F_WRITE) {
586 					log_warnx("wr vioblk: unexpected "
587 					    "writable data descriptor %d",
588 					    secdata_desc_idx);
589 					goto out;
590 				}
591 
592 				info = vioblk_start_write(dev,
593 				    cmd.sector + secbias,
594 				    secdata_desc->addr, secdata_desc->len);
595 
596 				if (info == NULL) {
597 					log_warnx("wr vioblk: can't read "
598 					    "sector data @ 0x%llx",
599 					    secdata_desc->addr);
600 					goto out;
601 				}
602 
603 				if (vioblk_finish_write(info)) {
604 					log_warnx("wr vioblk: disk write "
605 					    "error");
606 					vioblk_free_info(info);
607 					goto out;
608 				}
609 
610 				vioblk_free_info(info);
611 
612 				secbias += secdata_desc->len /
613 				    VIRTIO_BLK_SECTOR_SIZE;
614 
615 				secdata_desc_idx = secdata_desc->next &
616 				    VIOBLK_QUEUE_MASK;
617 				secdata_desc = &desc[secdata_desc_idx];
618 
619 				/* Guard against infinite chains */
620 				if (++cnt >= VIOBLK_QUEUE_SIZE) {
621 					log_warnx("%s: descriptor table "
622 					    "invalid", __func__);
623 					goto out;
624 				}
625 			} while (secdata_desc->flags & VRING_DESC_F_NEXT);
626 
627 			ds_desc_idx = secdata_desc_idx;
628 			ds_desc = secdata_desc;
629 
630 			ds = VIRTIO_BLK_S_OK;
631 			break;
632 		case VIRTIO_BLK_T_FLUSH:
633 		case VIRTIO_BLK_T_FLUSH_OUT:
634 			ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
635 			ds_desc = &desc[ds_desc_idx];
636 
637 			ds = VIRTIO_BLK_S_UNSUPP;
638 			break;
639 		case VIRTIO_BLK_T_GET_ID:
640 			secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
641 			secdata_desc = &desc[secdata_desc_idx];
642 
643 			/*
644 			 * We don't support this command yet. While it's not
645 			 * officially part of the virtio spec (it will be in v1.2),
646 			 * there's no feature to negotiate. Linux drivers will
647 			 * often send this command regardless.
648 			 *
649 			 * When the command is received, it should appear as a
650 			 * chain of 3 descriptors, similar to the IN/OUT
651 			 * commands. The middle descriptor should have a
652 			 * length of VIRTIO_BLK_ID_BYTES bytes.
653 			 */
654 			if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
655 				log_warnx("id vioblk: unchained vioblk data "
656 				    "descriptor received (idx %d)",
657 				    cmd_desc_idx);
658 				goto out;
659 			}
660 
661 			/* Skip the data descriptor. */
662 			ds_desc_idx = secdata_desc->next & VIOBLK_QUEUE_MASK;
663 			ds_desc = &desc[ds_desc_idx];
664 
665 			ds = VIRTIO_BLK_S_UNSUPP;
666 			break;
667 		default:
668 			log_warnx("%s: unsupported command 0x%x", __func__,
669 			    cmd.type);
670 			ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
671 			ds_desc = &desc[ds_desc_idx];
672 
673 			ds = VIRTIO_BLK_S_UNSUPP;
674 			break;
675 		}
676 
677 		if ((ds_desc->flags & VRING_DESC_F_WRITE) == 0) {
678 			log_warnx("%s: ds descriptor %d unwritable", __func__,
679 			    ds_desc_idx);
680 			goto out;
681 		}
682 		if (write_mem(ds_desc->addr, &ds, sizeof(ds))) {
683 			log_warnx("%s: can't write device status data @ 0x%llx",
684 			    __func__, ds_desc->addr);
685 			goto out;
686 		}
687 
688 		dev->cfg.isr_status = 1;
689 		used->ring[used->idx & VIOBLK_QUEUE_MASK].id = cmd_desc_idx;
690 		used->ring[used->idx & VIOBLK_QUEUE_MASK].len = cmd_desc->len;
691 		__sync_synchronize();
692 		used->idx++;
693 
694 		vq_info->last_avail = avail->idx & VIOBLK_QUEUE_MASK;
695 		idx = (idx + 1) & VIOBLK_QUEUE_MASK;
696 	}
697 out:
698 	return (1);
699 }
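/*
 * Register handler for the block device.  The device-specific config
 * space (offsets from VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI) exposes the
 * 64-bit capacity in 512-byte sectors (dev->sz) at bytes 0-7 and the
 * maximum transfer size (dev->max_xfer, matching the advertised
 * VIRTIO_BLK_F_SIZE_MAX feature) at bytes 8-11; the switch below
 * reassembles those fields for 1-, 2- and 4-byte accesses, e.g. a
 * 2-byte read at offset +6 returns bits 48-63 of dev->sz.
 */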
700 
701 int
702 virtio_blk_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
703     void *cookie, uint8_t sz)
704 {
705 	struct vioblk_dev *dev = (struct vioblk_dev *)cookie;
706 
707 	*intr = 0xFF;
708 
709 
710 	if (dir == 0) {
711 		switch (reg) {
712 		case VIRTIO_CONFIG_DEVICE_FEATURES:
713 		case VIRTIO_CONFIG_QUEUE_SIZE:
714 		case VIRTIO_CONFIG_ISR_STATUS:
715 			log_warnx("%s: illegal write %x to %s",
716 			    __progname, *data, virtio_reg_name(reg));
717 			break;
718 		case VIRTIO_CONFIG_GUEST_FEATURES:
719 			dev->cfg.guest_feature = *data;
720 			break;
721 		case VIRTIO_CONFIG_QUEUE_PFN:
722 			dev->cfg.queue_pfn = *data;
723 			vioblk_update_qa(dev);
724 			break;
725 		case VIRTIO_CONFIG_QUEUE_SELECT:
726 			dev->cfg.queue_select = *data;
727 			vioblk_update_qs(dev);
728 			break;
729 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
730 			dev->cfg.queue_notify = *data;
731 			if (vioblk_notifyq(dev))
732 				*intr = 1;
733 			break;
734 		case VIRTIO_CONFIG_DEVICE_STATUS:
735 			dev->cfg.device_status = *data;
736 			if (dev->cfg.device_status == 0) {
737 				log_debug("%s: device reset", __func__);
738 				dev->cfg.guest_feature = 0;
739 				dev->cfg.queue_pfn = 0;
740 				vioblk_update_qa(dev);
741 				dev->cfg.queue_size = 0;
742 				vioblk_update_qs(dev);
743 				dev->cfg.queue_select = 0;
744 				dev->cfg.queue_notify = 0;
745 				dev->cfg.isr_status = 0;
746 				dev->vq[0].last_avail = 0;
747 				vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
748 			}
749 			break;
750 		default:
751 			break;
752 		}
753 	} else {
754 		switch (reg) {
755 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
756 			switch (sz) {
757 			case 4:
758 				*data = (uint32_t)(dev->sz);
759 				break;
760 			case 2:
761 				*data &= 0xFFFF0000;
762 				*data |= (uint32_t)(dev->sz) & 0xFFFF;
763 				break;
764 			case 1:
765 				*data &= 0xFFFFFF00;
766 				*data |= (uint32_t)(dev->sz) & 0xFF;
767 				break;
768 			}
769 			/* XXX handle invalid sz */
770 			break;
771 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
772 			if (sz == 1) {
773 				*data &= 0xFFFFFF00;
774 				*data |= (uint32_t)(dev->sz >> 8) & 0xFF;
775 			}
776 			/* XXX handle invalid sz */
777 			break;
778 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
779 			if (sz == 1) {
780 				*data &= 0xFFFFFF00;
781 				*data |= (uint32_t)(dev->sz >> 16) & 0xFF;
782 			} else if (sz == 2) {
783 				*data &= 0xFFFF0000;
784 				*data |= (uint32_t)(dev->sz >> 16) & 0xFFFF;
785 			}
786 			/* XXX handle invalid sz */
787 			break;
788 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
789 			if (sz == 1) {
790 				*data &= 0xFFFFFF00;
791 				*data |= (uint32_t)(dev->sz >> 24) & 0xFF;
792 			}
793 			/* XXX handle invalid sz */
794 			break;
795 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
796 			switch (sz) {
797 			case 4:
798 				*data = (uint32_t)(dev->sz >> 32);
799 				break;
800 			case 2:
801 				*data &= 0xFFFF0000;
802 				*data |= (uint32_t)(dev->sz >> 32) & 0xFFFF;
803 				break;
804 			case 1:
805 				*data &= 0xFFFFFF00;
806 				*data |= (uint32_t)(dev->sz >> 32) & 0xFF;
807 				break;
808 			}
809 			/* XXX handle invalid sz */
810 			break;
811 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
812 			if (sz == 1) {
813 				*data &= 0xFFFFFF00;
814 				*data |= (uint32_t)(dev->sz >> 40) & 0xFF;
815 			}
816 			/* XXX handle invalid sz */
817 			break;
818 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 6:
819 			if (sz == 1) {
820 				*data &= 0xFFFFFF00;
821 				*data |= (uint32_t)(dev->sz >> 48) & 0xFF;
822 			} else if (sz == 2) {
823 				*data &= 0xFFFF0000;
824 				*data |= (uint32_t)(dev->sz >> 48) & 0xFFFF;
825 			}
826 			/* XXX handle invalid sz */
827 			break;
828 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 7:
829 			if (sz == 1) {
830 				*data &= 0xFFFFFF00;
831 				*data |= (uint32_t)(dev->sz >> 56) & 0xFF;
832 			}
833 			/* XXX handle invalid sz */
834 			break;
835 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
836 			switch (sz) {
837 			case 4:
838 				*data = (uint32_t)(dev->max_xfer);
839 				break;
840 			case 2:
841 				*data &= 0xFFFF0000;
842 				*data |= (uint32_t)(dev->max_xfer) & 0xFFFF;
843 				break;
844 			case 1:
845 				*data &= 0xFFFFFF00;
846 				*data |= (uint32_t)(dev->max_xfer) & 0xFF;
847 				break;
848 			}
849 			/* XXX handle invalid sz */
850 			break;
851 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 9:
852 			if (sz == 1) {
853 				*data &= 0xFFFFFF00;
854 				*data |= (uint32_t)(dev->max_xfer >> 8) & 0xFF;
855 			}
856 			/* XXX handle invalid sz */
857 			break;
858 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 10:
859 			if (sz == 1) {
860 				*data &= 0xFFFFFF00;
861 				*data |= (uint32_t)(dev->max_xfer >> 16) & 0xFF;
862 			} else if (sz == 2) {
863 				*data &= 0xFFFF0000;
864 				*data |= (uint32_t)(dev->max_xfer >> 16)
865 				    & 0xFFFF;
866 			}
867 			/* XXX handle invalid sz */
868 			break;
869 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 11:
870 			if (sz == 1) {
871 				*data &= 0xFFFFFF00;
872 				*data |= (uint32_t)(dev->max_xfer >> 24) & 0xFF;
873 			}
874 			/* XXX handle invalid sz */
875 			break;
876 		case VIRTIO_CONFIG_DEVICE_FEATURES:
877 			*data = dev->cfg.device_feature;
878 			break;
879 		case VIRTIO_CONFIG_GUEST_FEATURES:
880 			*data = dev->cfg.guest_feature;
881 			break;
882 		case VIRTIO_CONFIG_QUEUE_PFN:
883 			*data = dev->cfg.queue_pfn;
884 			break;
885 		case VIRTIO_CONFIG_QUEUE_SIZE:
886 			if (sz == 4)
887 				*data = dev->cfg.queue_size;
888 			else if (sz == 2) {
889 				*data &= 0xFFFF0000;
890 				*data |= (uint16_t)dev->cfg.queue_size;
891 			} else if (sz == 1) {
892 				*data &= 0xFFFFFF00;
893 				*data |= (uint8_t)dev->cfg.queue_size;
894 			}
895 			break;
896 		case VIRTIO_CONFIG_QUEUE_SELECT:
897 			*data = dev->cfg.queue_select;
898 			break;
899 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
900 			*data = dev->cfg.queue_notify;
901 			break;
902 		case VIRTIO_CONFIG_DEVICE_STATUS:
903 			if (sz == 4)
904 				*data = dev->cfg.device_status;
905 			else if (sz == 2) {
906 				*data &= 0xFFFF0000;
907 				*data |= (uint16_t)dev->cfg.device_status;
908 			} else if (sz == 1) {
909 				*data &= 0xFFFFFF00;
910 				*data |= (uint8_t)dev->cfg.device_status;
911 			}
912 			break;
913 		case VIRTIO_CONFIG_ISR_STATUS:
914 			*data = dev->cfg.isr_status;
915 			dev->cfg.isr_status = 0;
916 			vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
917 			break;
918 		}
919 	}
920 	return (0);
921 }
922 
923 int
924 virtio_net_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
925     void *cookie, uint8_t sz)
926 {
927 	struct vionet_dev *dev = (struct vionet_dev *)cookie;
928 
929 	*intr = 0xFF;
930 	mutex_lock(&dev->mutex);
931 
932 	if (dir == 0) {
933 		switch (reg) {
934 		case VIRTIO_CONFIG_DEVICE_FEATURES:
935 		case VIRTIO_CONFIG_QUEUE_SIZE:
936 		case VIRTIO_CONFIG_ISR_STATUS:
937 			log_warnx("%s: illegal write %x to %s",
938 			    __progname, *data, virtio_reg_name(reg));
939 			break;
940 		case VIRTIO_CONFIG_GUEST_FEATURES:
941 			dev->cfg.guest_feature = *data;
942 			break;
943 		case VIRTIO_CONFIG_QUEUE_PFN:
944 			dev->cfg.queue_pfn = *data;
945 			vionet_update_qa(dev);
946 			break;
947 		case VIRTIO_CONFIG_QUEUE_SELECT:
948 			dev->cfg.queue_select = *data;
949 			vionet_update_qs(dev);
950 			break;
951 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
952 			dev->cfg.queue_notify = *data;
953 			if (vionet_notifyq(dev))
954 				*intr = 1;
955 			break;
956 		case VIRTIO_CONFIG_DEVICE_STATUS:
957 			dev->cfg.device_status = *data;
958 			if (dev->cfg.device_status == 0) {
959 				log_debug("%s: device reset", __func__);
960 				dev->cfg.guest_feature = 0;
961 				dev->cfg.queue_pfn = 0;
962 				vionet_update_qa(dev);
963 				dev->cfg.queue_size = 0;
964 				vionet_update_qs(dev);
965 				dev->cfg.queue_select = 0;
966 				dev->cfg.queue_notify = 0;
967 				dev->cfg.isr_status = 0;
968 				dev->vq[RXQ].last_avail = 0;
969 				dev->vq[RXQ].notified_avail = 0;
970 				dev->vq[TXQ].last_avail = 0;
971 				dev->vq[TXQ].notified_avail = 0;
972 				vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
973 			}
974 			break;
975 		default:
976 			break;
977 		}
978 	} else {
979 		switch (reg) {
980 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
981 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
982 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
983 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
984 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
985 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
986 			*data = dev->mac[reg -
987 			    VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI];
988 			break;
989 		case VIRTIO_CONFIG_DEVICE_FEATURES:
990 			*data = dev->cfg.device_feature;
991 			break;
992 		case VIRTIO_CONFIG_GUEST_FEATURES:
993 			*data = dev->cfg.guest_feature;
994 			break;
995 		case VIRTIO_CONFIG_QUEUE_PFN:
996 			*data = dev->cfg.queue_pfn;
997 			break;
998 		case VIRTIO_CONFIG_QUEUE_SIZE:
999 			*data = dev->cfg.queue_size;
1000 			break;
1001 		case VIRTIO_CONFIG_QUEUE_SELECT:
1002 			*data = dev->cfg.queue_select;
1003 			break;
1004 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
1005 			*data = dev->cfg.queue_notify;
1006 			break;
1007 		case VIRTIO_CONFIG_DEVICE_STATUS:
1008 			*data = dev->cfg.device_status;
1009 			break;
1010 		case VIRTIO_CONFIG_ISR_STATUS:
1011 			*data = dev->cfg.isr_status;
1012 			dev->cfg.isr_status = 0;
1013 			vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq);
1014 			break;
1015 		}
1016 	}
1017 
1018 	mutex_unlock(&dev->mutex);
1019 	return (0);
1020 }
1021 
1022 /*
1023  * Must be called with dev->mutex acquired.
1024  */
1025 void
1026 vionet_update_qa(struct vionet_dev *dev)
1027 {
1028 	struct virtio_vq_info *vq_info;
1029 	void *hva = NULL;
1030 
1031 	/* Invalid queue? */
1032 	if (dev->cfg.queue_select > 1)
1033 		return;
1034 
1035 	vq_info = &dev->vq[dev->cfg.queue_select];
1036 	vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE;
1037 
1038 	hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIONET_QUEUE_SIZE));
1039 	if (hva == NULL)
1040 		fatal("vionet_update_qa");
1041 	vq_info->q_hva = hva;
1042 }
1043 
1044 /*
1045  * Must be called with dev->mutex acquired.
1046  */
1047 void
1048 vionet_update_qs(struct vionet_dev *dev)
1049 {
1050 	struct virtio_vq_info *vq_info;
1051 
1052 	/* Invalid queue? */
1053 	if (dev->cfg.queue_select > 1) {
1054 		dev->cfg.queue_size = 0;
1055 		return;
1056 	}
1057 
1058 	vq_info = &dev->vq[dev->cfg.queue_select];
1059 
1060 	/* Update queue pfn/size based on queue select */
1061 	dev->cfg.queue_pfn = vq_info->q_gpa >> 12;
1062 	dev->cfg.queue_size = vq_info->qs;
1063 }
1064 
1065 /*
1066  * vionet_enq_rx
1067  *
1068  * Take a given packet from the host-side tap and copy it into the guest's
1069  * buffers utilizing the rx virtio ring. If the packet length is invalid
1070  * (too small or too large) or if there are not enough buffers available,
1071  * the packet is dropped.
1072  *
1073  * Must be called with dev->mutex acquired.
1074  */
1075 int
1076 vionet_enq_rx(struct vionet_dev *dev, char *pkt, size_t sz, int *spc)
1077 {
1078 	uint16_t dxx, idx, hdr_desc_idx, chain_hdr_idx;
1079 	char *vr = NULL;
1080 	size_t bufsz = 0, off = 0, pkt_offset = 0, chunk_size = 0;
1081 	size_t chain_len = 0;
1082 	struct vring_desc *desc, *pkt_desc, *hdr_desc;
1083 	struct vring_avail *avail;
1084 	struct vring_used *used;
1085 	struct virtio_vq_info *vq_info;
1086 	struct virtio_net_hdr hdr;
1087 	size_t hdr_sz;
1088 
1089 	if (sz < VIONET_MIN_TXLEN || sz > VIONET_MAX_TXLEN) {
1090 		log_warnx("%s: invalid packet size", __func__);
1091 		return (0);
1092 	}
1093 
1094 	hdr_sz = sizeof(hdr);
1095 
1096 	if (!(dev->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK))
1097 		return (0);
1098 
1099 	vq_info = &dev->vq[RXQ];
1100 	vr = vq_info->q_hva;
1101 	if (vr == NULL)
1102 		fatalx("%s: null vring", __func__);
1103 
1104 	/* Compute offsets in ring of descriptors, avail ring, and used ring */
1105 	desc = (struct vring_desc *)(vr);
1106 	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
1107 	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
1108 
1109 	idx = vq_info->last_avail & VIONET_QUEUE_MASK;
1110 	if ((vq_info->notified_avail & VIONET_QUEUE_MASK) == idx) {
1111 		log_debug("%s: insufficient available buffer capacity, "
1112 		    "dropping packet.", __func__);
1113 		return (0);
1114 	}
1115 
1116 	hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK;
1117 	hdr_desc = &desc[hdr_desc_idx];
1118 
1119 	dxx = hdr_desc_idx;
1120 	chain_hdr_idx = dxx;
1121 	chain_len = 0;
1122 
1123 	/* Process the descriptor and walk any potential chain. */
1124 	do {
1125 		off = 0;
1126 		pkt_desc = &desc[dxx];
1127 		if (!(pkt_desc->flags & VRING_DESC_F_WRITE)) {
1128 			log_warnx("%s: invalid descriptor, not writable",
1129 			    __func__);
1130 			return (0);
1131 		}
1132 
1133 		/* How much data do we get to write? */
1134 		if (sz - bufsz > pkt_desc->len)
1135 			chunk_size = pkt_desc->len;
1136 		else
1137 			chunk_size = sz - bufsz;
1138 
1139 		if (chain_len == 0) {
1140 			off = hdr_sz;
1141 			if (chunk_size == pkt_desc->len)
1142 				chunk_size -= off;
1143 		}
1144 
1145 		/* Write a chunk of data if we need to */
1146 		if (chunk_size && write_mem(pkt_desc->addr + off,
1147 			pkt + pkt_offset, chunk_size)) {
1148 			log_warnx("%s: failed to write to buffer 0x%llx",
1149 			    __func__, pkt_desc->addr);
1150 			return (0);
1151 		}
1152 
1153 		chain_len += chunk_size + off;
1154 		bufsz += chunk_size;
1155 		pkt_offset += chunk_size;
1156 
1157 		dxx = pkt_desc->next & VIONET_QUEUE_MASK;
1158 	} while (bufsz < sz && pkt_desc->flags & VRING_DESC_F_NEXT);
1159 
1160 	/* Move our marker in the ring...*/
1161 	vq_info->last_avail = (vq_info->last_avail + 1) &
1162 	    VIONET_QUEUE_MASK;
1163 
1164 	/* Prepend the virtio net header in the first buffer. */
1165 	memset(&hdr, 0, sizeof(hdr));
1166 	hdr.hdr_len = hdr_sz;
1167 	if (write_mem(hdr_desc->addr, &hdr, hdr_sz)) {
1168 		log_warnx("vionet: rx enq header write_mem error @ 0x%llx",
1169 		    hdr_desc->addr);
1170 		return (0);
1171 	}
1172 
1173 	/* Update the index field in the used ring. This must be done last. */
1174 	dev->cfg.isr_status = 1;
1175 	*spc = (vq_info->notified_avail - vq_info->last_avail)
1176 	    & VIONET_QUEUE_MASK;
1177 
1178 	/* Update the list of used buffers. */
1179 	used->ring[used->idx & VIONET_QUEUE_MASK].id = chain_hdr_idx;
1180 	used->ring[used->idx & VIONET_QUEUE_MASK].len = chain_len;
1181 	__sync_synchronize();
1182 	used->idx++;
1183 
1184 	return (1);
1185 }
1186 
1187 /*
1188  * vionet_rx
1189  *
1190  * Enqueue data that was received on a tap file descriptor
1191  * to the vionet device queue.
1192  *
1193  * Must be called with dev->mutex acquired.
1194  */
1195 static int
1196 vionet_rx(struct vionet_dev *dev)
1197 {
1198 	char buf[PAGE_SIZE];
1199 	int num_enq = 0, spc = 0;
1200 	struct ether_header *eh;
1201 	ssize_t sz;
1202 
1203 	do {
1204 		sz = read(dev->fd, buf, sizeof(buf));
1205 		if (sz == -1) {
1206 			/*
1207 			 * If we get EAGAIN, no data is currently available.
1208 			 * Do not treat this as an error.
1209 			 */
1210 			if (errno != EAGAIN)
1211 				log_warn("unexpected read error on vionet "
1212 				    "device");
1213 		} else if (sz > 0) {
1214 			eh = (struct ether_header *)buf;
1215 			if (!dev->lockedmac ||
1216 			    ETHER_IS_MULTICAST(eh->ether_dhost) ||
1217 			    memcmp(eh->ether_dhost, dev->mac,
1218 			    sizeof(eh->ether_dhost)) == 0)
1219 				num_enq += vionet_enq_rx(dev, buf, sz, &spc);
1220 		} else if (sz == 0) {
1221 			log_debug("%s: no data", __func__);
1222 			break;
1223 		}
1224 	} while (spc > 0 && sz > 0);
1225 
1226 	return (num_enq);
1227 }
1228 
1229 /*
1230  * vionet_rx_event
1231  *
1232  * Called from the event handling thread when new data can be
1233  * received on the tap fd of a vionet device.
1234  */
1235 static void
1236 vionet_rx_event(int fd, short kind, void *arg)
1237 {
1238 	struct vionet_dev *dev = arg;
1239 
1240 	mutex_lock(&dev->mutex);
1241 
1242 	if (vionet_rx(dev) > 0) {
1243 		/* XXX: vcpu_id */
1244 		vcpu_assert_pic_irq(dev->vm_id, 0, dev->irq);
1245 	}
1246 
1247 	mutex_unlock(&dev->mutex);
1248 }
1249 
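/*
 * An RX-queue notify only records the guest's current avail index
 * (minus one) in notified_avail; vionet_enq_rx() later compares it
 * against last_avail to decide whether receive buffers remain before
 * copying a packet in.
 */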
1250 /*
1251  * Must be called with dev->mutex acquired.
1252  */
1253 void
1254 vionet_notify_rx(struct vionet_dev *dev)
1255 {
1256 	char *vr;
1257 	struct vring_avail *avail;
1258 	struct virtio_vq_info *vq_info;
1259 
1260 	vq_info = &dev->vq[RXQ];
1261 	vr = vq_info->q_hva;
1262 	if (vr == NULL)
1263 		fatalx("%s: null vring", __func__);
1264 
1265 	/* Compute offset into avail ring */
1266 	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
1267 	vq_info->notified_avail = avail->idx - 1;
1268 }
1269 
1270 /*
1271  * Must be called with dev->mutex acquired.
1272  */
1273 int
1274 vionet_notifyq(struct vionet_dev *dev)
1275 {
1276 	int ret = 0;
1277 
1278 	switch (dev->cfg.queue_notify) {
1279 	case RXQ:
1280 		vionet_notify_rx(dev);
1281 		break;
1282 	case TXQ:
1283 		ret = vionet_notify_tx(dev);
1284 		break;
1285 	default:
1286 		/*
1287 		 * Catch the unimplemented queue ID 2 (control queue) as
1288 		 * well as any bogus queue IDs.
1289 		 */
1290 		log_debug("%s: notify for unimplemented queue ID %d",
1291 		    __func__, dev->cfg.queue_notify);
1292 		break;
1293 	}
1294 
1295 	return (ret);
1296 }
1297 
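/*
 * TX-queue notify: walk the avail ring, measure each descriptor chain
 * (bounded by the queue size to guard against looping chains),
 * linearize the packet into a malloc'd buffer and then either drop it
 * (wrong source address on a locked interface), answer it locally
 * (DHCP when "local" is set) or write it to the tap fd.  Every chain
 * is handed back to the guest via the used ring regardless of outcome.
 */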
1298 /*
1299  * Must be called with dev->mutex acquired.
1300  */
1301 int
1302 vionet_notify_tx(struct vionet_dev *dev)
1303 {
1304 	uint16_t idx, pkt_desc_idx, hdr_desc_idx, dxx, cnt;
1305 	size_t pktsz, chunk_size = 0;
1306 	ssize_t dhcpsz = 0;
1307 	int num_enq, ofs, spc = 0;
1308 	char *vr = NULL, *pkt = NULL, *dhcppkt = NULL;
1309 	struct vring_desc *desc, *pkt_desc, *hdr_desc;
1310 	struct vring_avail *avail;
1311 	struct vring_used *used;
1312 	struct virtio_vq_info *vq_info;
1313 	struct ether_header *eh;
1314 
1315 	vq_info = &dev->vq[TXQ];
1316 	vr = vq_info->q_hva;
1317 	if (vr == NULL)
1318 		fatalx("%s: null vring", __func__);
1319 
1320 	/* Compute offsets in ring of descriptors, avail ring, and used ring */
1321 	desc = (struct vring_desc *)(vr);
1322 	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
1323 	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
1324 
1325 	num_enq = 0;
1326 
1327 	idx = vq_info->last_avail & VIONET_QUEUE_MASK;
1328 
1329 	if ((avail->idx & VIONET_QUEUE_MASK) == idx) {
1330 		log_debug("%s - nothing to do?", __func__);
1331 		return (0);
1332 	}
1333 
1334 	while ((avail->idx & VIONET_QUEUE_MASK) != idx) {
1335 		hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK;
1336 		hdr_desc = &desc[hdr_desc_idx];
1337 		pktsz = 0;
1338 
1339 		cnt = 0;
1340 		dxx = hdr_desc_idx;
1341 		do {
1342 			pktsz += desc[dxx].len;
1343 			dxx = desc[dxx].next & VIONET_QUEUE_MASK;
1344 
1345 			/*
1346 			 * Virtio 1.0, cs04, section 2.4.5:
1347 			 *  "The number of descriptors in the table is defined
1348 			 *   by the queue size for this virtqueue: this is the
1349 			 *   maximum possible descriptor chain length."
1350 			 */
1351 			if (++cnt >= VIONET_QUEUE_SIZE) {
1352 				log_warnx("%s: descriptor table invalid",
1353 				    __func__);
1354 				goto out;
1355 			}
1356 		} while (desc[dxx].flags & VRING_DESC_F_NEXT);
1357 
1358 		pktsz += desc[dxx].len;
1359 
1360 		/* Remove virtio header descriptor len */
1361 		pktsz -= hdr_desc->len;
1362 
1363 		/* Drop packets violating device MTU-based limits */
1364 		if (pktsz < VIONET_MIN_TXLEN || pktsz > VIONET_MAX_TXLEN) {
1365 			log_warnx("%s: invalid packet size %lu", __func__,
1366 			    pktsz);
1367 			goto drop_packet;
1368 		}
1369 		pkt = malloc(pktsz);
1370 		if (pkt == NULL) {
1371 			log_warn("malloc error alloc packet buf");
1372 			goto out;
1373 		}
1374 
1375 		ofs = 0;
1376 		pkt_desc_idx = hdr_desc->next & VIONET_QUEUE_MASK;
1377 		pkt_desc = &desc[pkt_desc_idx];
1378 
1379 		while (pkt_desc->flags & VRING_DESC_F_NEXT) {
1380 			/* must be not writable */
1381 			if (pkt_desc->flags & VRING_DESC_F_WRITE) {
1382 				log_warnx("unexpected writable tx desc "
1383 				    "%d", pkt_desc_idx);
1384 				goto out;
1385 			}
1386 
1387 			/* Check we don't read beyond allocated pktsz */
1388 			if (pkt_desc->len > pktsz - ofs) {
1389 				log_warnx("%s: descriptor len past pkt len",
1390 				    __func__);
1391 				chunk_size = pktsz - ofs;
1392 			} else
1393 				chunk_size = pkt_desc->len;
1394 
1395 			/* Read packet from descriptor ring */
1396 			if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) {
1397 				log_warnx("vionet: packet read_mem error "
1398 				    "@ 0x%llx", pkt_desc->addr);
1399 				goto out;
1400 			}
1401 
1402 			ofs += pkt_desc->len;
1403 			pkt_desc_idx = pkt_desc->next & VIONET_QUEUE_MASK;
1404 			pkt_desc = &desc[pkt_desc_idx];
1405 		}
1406 
1407 		/* Now handle tail descriptor - must be not writable */
1408 		if (pkt_desc->flags & VRING_DESC_F_WRITE) {
1409 			log_warnx("unexpected writable tx descriptor %d",
1410 			    pkt_desc_idx);
1411 			goto out;
1412 		}
1413 
1414 		/* Check we don't read beyond allocated pktsz */
1415 		if (pkt_desc->len > pktsz - ofs) {
1416 			log_warnx("%s: descriptor len past pkt len", __func__);
1417 			chunk_size = pktsz - ofs;
1418 		} else
1419 			chunk_size = pkt_desc->len;
1420 
1421 		/* Read packet from descriptor ring */
1422 		if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) {
1423 			log_warnx("vionet: packet read_mem error @ "
1424 			    "0x%llx", pkt_desc->addr);
1425 			goto out;
1426 		}
1427 
1428 		/* reject other source addresses */
1429 		if (dev->lockedmac && pktsz >= ETHER_HDR_LEN &&
1430 		    (eh = (struct ether_header *)pkt) &&
1431 		    memcmp(eh->ether_shost, dev->mac,
1432 		    sizeof(eh->ether_shost)) != 0)
1433 			log_debug("vionet: wrong source address %s for vm %d",
1434 			    ether_ntoa((struct ether_addr *)
1435 			    eh->ether_shost), dev->vm_id);
1436 		else if (dev->local &&
1437 		    (dhcpsz = dhcp_request(dev, pkt, pktsz, &dhcppkt)) != -1) {
1438 			log_debug("vionet: dhcp request,"
1439 			    " local response size %zd", dhcpsz);
1440 
1441 		/* XXX signed vs unsigned here, funky cast */
1442 		} else if (write(dev->fd, pkt, pktsz) != (int)pktsz) {
1443 			log_warnx("vionet: tx failed writing to tap: "
1444 			    "%d", errno);
1445 			goto out;
1446 		}
1447 
1448 	drop_packet:
1449 		dev->cfg.isr_status = 1;
1450 		used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_desc_idx;
1451 		used->ring[used->idx & VIONET_QUEUE_MASK].len = hdr_desc->len;
1452 		__sync_synchronize();
1453 		used->idx++;
1454 
1455 		vq_info->last_avail = avail->idx & VIONET_QUEUE_MASK;
1456 		idx = (idx + 1) & VIONET_QUEUE_MASK;
1457 
1458 		num_enq++;
1459 
1460 		free(pkt);
1461 		pkt = NULL;
1462 	}
1463 
1464 	if (dhcpsz > 0)
1465 		vionet_enq_rx(dev, dhcppkt, dhcpsz, &spc);
1466 
1467 out:
1468 	free(pkt);
1469 	free(dhcppkt);
1470 
1471 	return (1);
1472 }
1473 
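/*
 * vmmci is the OpenBSD-specific control device.  vmmci_ctl() pushes a
 * command (shutdown, reboot, RTC sync) to the guest by storing it,
 * raising a config-change interrupt and arming an ACK timeout.  The
 * guest acknowledges by writing the command back to config register 0,
 * which ends up in vmmci_ack(); vmmci_timeout() forcefully shuts down
 * or reboots the VM if the guest never finishes on its own.
 */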
1474 int
1475 vmmci_ctl(unsigned int cmd)
1476 {
1477 	struct timeval tv = { 0, 0 };
1478 
1479 	if ((vmmci.cfg.device_status &
1480 	    VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0)
1481 		return (-1);
1482 
1483 	if (cmd == vmmci.cmd)
1484 		return (0);
1485 
1486 	switch (cmd) {
1487 	case VMMCI_NONE:
1488 		break;
1489 	case VMMCI_SHUTDOWN:
1490 	case VMMCI_REBOOT:
1491 		/* Update command */
1492 		vmmci.cmd = cmd;
1493 
1494 		/*
1495 		 * vmm VMs do not support powerdown, send a reboot request
1496 		 * instead and turn it off after the triple fault.
1497 		 */
1498 		if (cmd == VMMCI_SHUTDOWN)
1499 			cmd = VMMCI_REBOOT;
1500 
1501 		/* Trigger interrupt */
1502 		vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
1503 		vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
1504 
1505 		/* Add ACK timeout */
1506 		tv.tv_sec = VMMCI_TIMEOUT;
1507 		evtimer_add(&vmmci.timeout, &tv);
1508 		break;
1509 	case VMMCI_SYNCRTC:
1510 		if (vmmci.cfg.guest_feature & VMMCI_F_SYNCRTC) {
1511 			/* RTC updated, request guest VM resync of its RTC */
1512 			vmmci.cmd = cmd;
1513 
1514 			vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
1515 			vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
1516 		} else {
1517 			log_debug("%s: RTC sync skipped (guest does not "
1518 			    "support RTC sync)", __func__);
1519 		}
1520 		break;
1521 	default:
1522 		fatalx("invalid vmmci command: %d", cmd);
1523 	}
1524 
1525 	return (0);
1526 }
1527 
1528 void
1529 vmmci_ack(unsigned int cmd)
1530 {
1531 	struct timeval	 tv = { 0, 0 };
1532 
1533 	switch (cmd) {
1534 	case VMMCI_NONE:
1535 		break;
1536 	case VMMCI_SHUTDOWN:
1537 		/*
1538 		 * The shutdown was requested by the VM if we don't have
1539 		 * a pending shutdown request.  In this case add a short
1540 		 * timeout to give the VM a chance to reboot before the
1541 		 * timer is expired.
1542 		 */
1543 		if (vmmci.cmd == 0) {
1544 			log_debug("%s: vm %u requested shutdown", __func__,
1545 			    vmmci.vm_id);
1546 			tv.tv_sec = VMMCI_TIMEOUT;
1547 			evtimer_add(&vmmci.timeout, &tv);
1548 			return;
1549 		}
1550 		/* FALLTHROUGH */
1551 	case VMMCI_REBOOT:
1552 		/*
1553 		 * If the VM acknowledged our shutdown request, give it
1554 		 * enough time to shutdown or reboot gracefully.  This
1555 		 * might take a considerable amount of time (running
1556 		 * rc.shutdown on the VM), so increase the timeout before
1557 		 * killing it forcefully.
1558 		 */
1559 		if (cmd == vmmci.cmd &&
1560 		    evtimer_pending(&vmmci.timeout, NULL)) {
1561 			log_debug("%s: vm %u acknowledged shutdown request",
1562 			    __func__, vmmci.vm_id);
1563 			tv.tv_sec = VMMCI_SHUTDOWN_TIMEOUT;
1564 			evtimer_add(&vmmci.timeout, &tv);
1565 		}
1566 		break;
1567 	case VMMCI_SYNCRTC:
1568 		log_debug("%s: vm %u acknowledged RTC sync request",
1569 		    __func__, vmmci.vm_id);
1570 		vmmci.cmd = VMMCI_NONE;
1571 		break;
1572 	default:
1573 		log_warnx("%s: illegal request %u", __func__, cmd);
1574 		break;
1575 	}
1576 }
1577 
1578 void
1579 vmmci_timeout(int fd, short type, void *arg)
1580 {
1581 	log_debug("%s: vm %u shutdown", __progname, vmmci.vm_id);
1582 	vm_shutdown(vmmci.cmd == VMMCI_REBOOT ? VMMCI_REBOOT : VMMCI_SHUTDOWN);
1583 }
1584 
1585 int
1586 vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
1587     void *unused, uint8_t sz)
1588 {
1589 	*intr = 0xFF;
1590 
1591 	if (dir == 0) {
1592 		switch (reg) {
1593 		case VIRTIO_CONFIG_DEVICE_FEATURES:
1594 		case VIRTIO_CONFIG_QUEUE_SIZE:
1595 		case VIRTIO_CONFIG_ISR_STATUS:
1596 			log_warnx("%s: illegal write %x to %s",
1597 			    __progname, *data, virtio_reg_name(reg));
1598 			break;
1599 		case VIRTIO_CONFIG_GUEST_FEATURES:
1600 			vmmci.cfg.guest_feature = *data;
1601 			break;
1602 		case VIRTIO_CONFIG_QUEUE_PFN:
1603 			vmmci.cfg.queue_pfn = *data;
1604 			break;
1605 		case VIRTIO_CONFIG_QUEUE_SELECT:
1606 			vmmci.cfg.queue_select = *data;
1607 			break;
1608 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
1609 			vmmci.cfg.queue_notify = *data;
1610 			break;
1611 		case VIRTIO_CONFIG_DEVICE_STATUS:
1612 			vmmci.cfg.device_status = *data;
1613 			break;
1614 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
1615 			vmmci_ack(*data);
1616 			break;
1617 		}
1618 	} else {
1619 		switch (reg) {
1620 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
1621 			*data = vmmci.cmd;
1622 			break;
1623 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
1624 			/* Update time once when reading the first register */
1625 			gettimeofday(&vmmci.time, NULL);
1626 			*data = (uint64_t)vmmci.time.tv_sec;
1627 			break;
1628 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
1629 			*data = (uint64_t)vmmci.time.tv_sec >> 32;
1630 			break;
1631 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
1632 			*data = (uint64_t)vmmci.time.tv_usec;
1633 			break;
1634 		case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16:
1635 			*data = (uint64_t)vmmci.time.tv_usec >> 32;
1636 			break;
1637 		case VIRTIO_CONFIG_DEVICE_FEATURES:
1638 			*data = vmmci.cfg.device_feature;
1639 			break;
1640 		case VIRTIO_CONFIG_GUEST_FEATURES:
1641 			*data = vmmci.cfg.guest_feature;
1642 			break;
1643 		case VIRTIO_CONFIG_QUEUE_PFN:
1644 			*data = vmmci.cfg.queue_pfn;
1645 			break;
1646 		case VIRTIO_CONFIG_QUEUE_SIZE:
1647 			*data = vmmci.cfg.queue_size;
1648 			break;
1649 		case VIRTIO_CONFIG_QUEUE_SELECT:
1650 			*data = vmmci.cfg.queue_select;
1651 			break;
1652 		case VIRTIO_CONFIG_QUEUE_NOTIFY:
1653 			*data = vmmci.cfg.queue_notify;
1654 			break;
1655 		case VIRTIO_CONFIG_DEVICE_STATUS:
1656 			*data = vmmci.cfg.device_status;
1657 			break;
1658 		case VIRTIO_CONFIG_ISR_STATUS:
1659 			*data = vmmci.cfg.isr_status;
1660 			vmmci.cfg.isr_status = 0;
1661 			vcpu_deassert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
1662 			break;
1663 		}
1664 	}
1665 	return (0);
1666 }
1667 
1668 int
1669 virtio_get_base(int fd, char *path, size_t npath, int type, const char *dpath)
1670 {
1671 	switch (type) {
1672 	case VMDF_RAW:
1673 		return 0;
1674 	case VMDF_QCOW2:
1675 		return virtio_qcow2_get_base(fd, path, npath, dpath);
1676 	}
1677 	log_warnx("%s: invalid disk format", __func__);
1678 	return -1;
1679 }
1680 
1681 /*
1682  * Initializes a struct virtio_backing using the list of fds.
1683  */
1684 static int
1685 virtio_init_disk(struct virtio_backing *file, off_t *sz,
1686     int *fd, size_t nfd, int type)
1687 {
1688 	/*
1689 	 * Initialize the backing file for the disk format given in "type";
1690 	 * the format is not probed.  TODO: provide a way of specifying options.
1691 	 */
1692 	switch (type) {
1693 	case VMDF_RAW:
1694 		return virtio_raw_init(file, sz, fd, nfd);
1695 	case VMDF_QCOW2:
1696 		return virtio_qcow2_init(file, sz, fd, nfd);
1697 	}
1698 	log_warnx("%s: invalid disk format", __func__);
1699 	return -1;
1700 }
1701 
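/*
 * Create the virtio PCI devices for a new VM.  Every device follows
 * the same pattern: pci_add_device() allocates a slot, pci_add_bar()
 * attaches the matching I/O handler (virtio_rnd_io, virtio_net_io,
 * virtio_blk_io, vioscsi_io, vmmci_io), the virtqueue offsets are
 * precomputed with VIRTQUEUE_ALIGN() and the IRQ chosen by the PCI
 * emulation is cached in the per-device state.
 */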
1702 void
1703 virtio_init(struct vmd_vm *vm, int child_cdrom,
1704     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1705 {
1706 	struct vmop_create_params *vmc = &vm->vm_params;
1707 	struct vm_create_params *vcp = &vmc->vmc_params;
1708 	uint8_t id;
1709 	uint8_t i;
1710 	int ret;
1711 
1712 	/* Virtio entropy device */
1713 	if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
1714 	    PCI_PRODUCT_QUMRANET_VIO_RNG, PCI_CLASS_SYSTEM,
1715 	    PCI_SUBCLASS_SYSTEM_MISC,
1716 	    PCI_VENDOR_OPENBSD,
1717 	    PCI_PRODUCT_VIRTIO_ENTROPY, 1, NULL)) {
1718 		log_warnx("%s: can't add PCI virtio rng device",
1719 		    __progname);
1720 		return;
1721 	}
1722 
1723 	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_rnd_io, NULL)) {
1724 		log_warnx("%s: can't add bar for virtio rng device",
1725 		    __progname);
1726 		return;
1727 	}
1728 
1729 	memset(&viornd, 0, sizeof(viornd));
1730 	viornd.vq[0].qs = VIORND_QUEUE_SIZE;
1731 	viornd.vq[0].vq_availoffset = sizeof(struct vring_desc) *
1732 	    VIORND_QUEUE_SIZE;
1733 	viornd.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
1734 	    sizeof(struct vring_desc) * VIORND_QUEUE_SIZE
1735 	    + sizeof(uint16_t) * (2 + VIORND_QUEUE_SIZE));
1736 	viornd.pci_id = id;
1737 	viornd.irq = pci_get_dev_irq(id);
1738 	viornd.vm_id = vcp->vcp_id;
1739 
1740 	if (vcp->vcp_nnics > 0) {
1741 		vionet = calloc(vcp->vcp_nnics, sizeof(struct vionet_dev));
1742 		if (vionet == NULL) {
1743 			log_warn("%s: calloc failure allocating vionets",
1744 			    __progname);
1745 			return;
1746 		}
1747 
1748 		nr_vionet = vcp->vcp_nnics;
1749 		/* Virtio network */
1750 		for (i = 0; i < vcp->vcp_nnics; i++) {
1751 			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
1752 			    PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM,
1753 			    PCI_SUBCLASS_SYSTEM_MISC,
1754 			    PCI_VENDOR_OPENBSD,
1755 			    PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) {
1756 				log_warnx("%s: can't add PCI virtio net device",
1757 				    __progname);
1758 				return;
1759 			}
1760 
1761 			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_net_io,
1762 			    &vionet[i])) {
1763 				log_warnx("%s: can't add bar for virtio net "
1764 				    "device", __progname);
1765 				return;
1766 			}
1767 
1768 			ret = pthread_mutex_init(&vionet[i].mutex, NULL);
1769 			if (ret) {
1770 				errno = ret;
1771 				log_warn("%s: could not initialize mutex "
1772 				    "for vionet device", __progname);
1773 				return;
1774 			}
1775 
1776 			vionet[i].vq[RXQ].qs = VIONET_QUEUE_SIZE;
1777 			vionet[i].vq[RXQ].vq_availoffset =
1778 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
1779 			vionet[i].vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
1780 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
1781 			    + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
1782 			vionet[i].vq[RXQ].last_avail = 0;
1783 			vionet[i].vq[RXQ].notified_avail = 0;
1784 
1785 			vionet[i].vq[TXQ].qs = VIONET_QUEUE_SIZE;
1786 			vionet[i].vq[TXQ].vq_availoffset =
1787 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
1788 			vionet[i].vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
1789 			    sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
1790 			    + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
1791 			vionet[i].vq[TXQ].last_avail = 0;
1792 			vionet[i].vq[TXQ].notified_avail = 0;
1793 			vionet[i].fd = child_taps[i];
1794 			vionet[i].vm_id = vcp->vcp_id;
1795 			vionet[i].vm_vmid = vm->vm_vmid;
1796 			vionet[i].irq = pci_get_dev_irq(id);
1797 
1798 			event_set(&vionet[i].event, vionet[i].fd,
1799 			    EV_READ | EV_PERSIST, vionet_rx_event, &vionet[i]);
1800 			if (event_add(&vionet[i].event, NULL)) {
1801 				log_warn("could not initialize vionet event "
1802 				    "handler");
1803 				return;
1804 			}
1805 
1806 			/* MAC address has been assigned by the parent */
1807 			memcpy(&vionet[i].mac, &vcp->vcp_macs[i], 6);
1808 			vionet[i].cfg.device_feature = VIRTIO_NET_F_MAC;
1809 
1810 			vionet[i].lockedmac =
1811 			    vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
1812 			vionet[i].local =
1813 			    vmc->vmc_ifflags[i] & VMIFF_LOCAL ? 1 : 0;
1814 			if (i == 0 && vmc->vmc_bootdevice & VMBOOTDEV_NET)
1815 				vionet[i].pxeboot = 1;
1816 			vionet[i].idx = i;
1817 			vionet[i].pci_id = id;
1818 
1819 			log_debug("%s: vm \"%s\" vio%u lladdr %s%s%s%s",
1820 			    __func__, vcp->vcp_name, i,
1821 			    ether_ntoa((void *)vionet[i].mac),
1822 			    vionet[i].lockedmac ? ", locked" : "",
1823 			    vionet[i].local ? ", local" : "",
1824 			    vionet[i].pxeboot ? ", pxeboot" : "");
1825 		}
1826 	}
1827 
1828 	if (vcp->vcp_ndisks > 0) {
1829 		nr_vioblk = vcp->vcp_ndisks;
1830 		vioblk = calloc(vcp->vcp_ndisks, sizeof(struct vioblk_dev));
1831 		if (vioblk == NULL) {
1832 			log_warn("%s: calloc failure allocating vioblks",
1833 			    __progname);
1834 			return;
1835 		}
1836 
1837 		/* One virtio block device for each disk defined in vcp */
1838 		for (i = 0; i < vcp->vcp_ndisks; i++) {
1839 			if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
1840 			    PCI_PRODUCT_QUMRANET_VIO_BLOCK,
1841 			    PCI_CLASS_MASS_STORAGE,
1842 			    PCI_SUBCLASS_MASS_STORAGE_SCSI,
1843 			    PCI_VENDOR_OPENBSD,
1844 			    PCI_PRODUCT_VIRTIO_BLOCK, 1, NULL)) {
1845 				log_warnx("%s: can't add PCI virtio block "
1846 				    "device", __progname);
1847 				return;
1848 			}
1849 			if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_blk_io,
1850 			    &vioblk[i])) {
1851 				log_warnx("%s: can't add bar for virtio block "
1852 				    "device", __progname);
1853 				return;
1854 			}
1855 			vioblk[i].vq[0].qs = VIOBLK_QUEUE_SIZE;
1856 			vioblk[i].vq[0].vq_availoffset =
1857 			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE;
1858 			vioblk[i].vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
1859 			    sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE
1860 			    + sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE));
1861 			vioblk[i].vq[0].last_avail = 0;
1862 			vioblk[i].cfg.device_feature = VIRTIO_BLK_F_SIZE_MAX;
1863 			vioblk[i].max_xfer = 1048576;
1864 			vioblk[i].pci_id = id;
1865 			vioblk[i].vm_id = vcp->vcp_id;
1866 			vioblk[i].irq = pci_get_dev_irq(id);
1867 			if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
1868 			    child_disks[i], vmc->vmc_diskbases[i],
1869 			    vmc->vmc_disktypes[i]) == -1) {
1870 				log_warnx("%s: unable to determine disk format",
1871 				    __func__);
1872 				return;
1873 			}
1874 			vioblk[i].sz /= 512;
1875 		}
1876 	}
1877 
1878 	/* vioscsi cdrom */
1879 	if (strlen(vcp->vcp_cdrom)) {
1880 		vioscsi = calloc(1, sizeof(struct vioscsi_dev));
1881 		if (vioscsi == NULL) {
1882 			log_warn("%s: calloc failure allocating vioscsi",
1883 			    __progname);
1884 			return;
1885 		}
1886 
1887 		if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
1888 		    PCI_PRODUCT_QUMRANET_VIO_SCSI,
1889 		    PCI_CLASS_MASS_STORAGE,
1890 		    PCI_SUBCLASS_MASS_STORAGE_SCSI,
1891 		    PCI_VENDOR_OPENBSD,
1892 		    PCI_PRODUCT_VIRTIO_SCSI, 1, NULL)) {
1893 			log_warnx("%s: can't add PCI vioscsi device",
1894 			    __progname);
1895 			return;
1896 		}
1897 
1898 		if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vioscsi_io, vioscsi)) {
1899 			log_warnx("%s: can't add bar for vioscsi device",
1900 			    __progname);
1901 			return;
1902 		}
1903 
1904 		for (i = 0; i < VIRTIO_MAX_QUEUES; i++) {
1905 			vioscsi->vq[i].qs = VIOSCSI_QUEUE_SIZE;
1906 			vioscsi->vq[i].vq_availoffset =
1907 			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE;
1908 			vioscsi->vq[i].vq_usedoffset = VIRTQUEUE_ALIGN(
1909 			    sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE
1910 			    + sizeof(uint16_t) * (2 + VIOSCSI_QUEUE_SIZE));
1911 			vioscsi->vq[i].last_avail = 0;
1912 		}
1913 		if (virtio_init_disk(&vioscsi->file, &vioscsi->sz,
1914 		    &child_cdrom, 1, VMDF_RAW) == -1) {
1915 			log_warnx("%s: unable to determine iso format",
1916 			    __func__);
1917 			return;
1918 		}
1919 		vioscsi->locked = 0;
1920 		vioscsi->lba = 0;
1921 		vioscsi->n_blocks = vioscsi->sz >> 11; /* number of 2048-byte blocks in file */
1922 		vioscsi->max_xfer = VIOSCSI_BLOCK_SIZE_CDROM;
1923 		vioscsi->pci_id = id;
1924 		vioscsi->vm_id = vcp->vcp_id;
1925 		vioscsi->irq = pci_get_dev_irq(id);
1926 	}
1927 
1928 	/* virtio control device */
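	/*
	 * vmmci is the host<->guest control channel: the parent uses it to
	 * request shutdown/reboot (acknowledged by the guest) and to push
	 * clock/RTC synchronization, matching the three feature bits
	 * advertised below.
	 */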
1929 	if (pci_add_device(&id, PCI_VENDOR_OPENBSD,
1930 	    PCI_PRODUCT_OPENBSD_CONTROL,
1931 	    PCI_CLASS_COMMUNICATIONS,
1932 	    PCI_SUBCLASS_COMMUNICATIONS_MISC,
1933 	    PCI_VENDOR_OPENBSD,
1934 	    PCI_PRODUCT_VIRTIO_VMMCI, 1, NULL)) {
1935 		log_warnx("%s: can't add PCI vmm control device",
1936 		    __progname);
1937 		return;
1938 	}
1939 
1940 	if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vmmci_io, NULL)) {
1941 		log_warnx("%s: can't add bar for vmm control device",
1942 		    __progname);
1943 		return;
1944 	}
1945 
1946 	memset(&vmmci, 0, sizeof(vmmci));
1947 	vmmci.cfg.device_feature = VMMCI_F_TIMESYNC | VMMCI_F_ACK |
1948 	    VMMCI_F_SYNCRTC;
1949 	vmmci.vm_id = vcp->vcp_id;
1950 	vmmci.irq = pci_get_dev_irq(id);
1951 	vmmci.pci_id = id;
1952 
1953 	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
1954 }
1955 
1956 /*
1957  * vionet_set_hostmac
1958  *
1959  * Sets the hardware address for the host-side tap(4) on a vionet_dev.
1960  *
1961  * This should only be called from the event-loop thread.
1962  *
1963  * vm: pointer to the current vmd_vm instance
1964  * idx: index into the array of vionet_dev's for the target vionet_dev
1965  * addr: ethernet address to set
1966  */
1967 void
1968 vionet_set_hostmac(struct vmd_vm *vm, unsigned int idx, uint8_t *addr)
1969 {
1970 	struct vmop_create_params *vmc = &vm->vm_params;
1971 	struct vm_create_params	  *vcp = &vmc->vmc_params;
1972 	struct vionet_dev	  *dev;
1973 
1974 	if (idx >= vcp->vcp_nnics)
1975 		fatalx("vionet_set_hostmac");
1976 
1977 	dev = &vionet[idx];
1978 	memcpy(dev->hostmac, addr, sizeof(dev->hostmac));
1979 }
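
/*
 * Illustrative example (hypothetical caller, not an existing call site):
 * once the parent has picked a hardware address for the host-side tap(4),
 * it could be applied to the first NIC like this, assuming "vm" is the
 * current vmd_vm and the address value is arbitrary:
 *
 *	uint8_t mac[ETHER_ADDR_LEN] = { 0xfe, 0xe1, 0xba, 0xd0, 0x00, 0x01 };
 *	vionet_set_hostmac(vm, 0, mac);
 *
 * The copy is bounded by sizeof(dev->hostmac), so callers must supply at
 * least that many bytes.
 */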
1980 
1981 void
1982 virtio_shutdown(struct vmd_vm *vm)
1983 {
1984 	int i;
1985 
1986 	/* ensure that our disks are synced */
1987 	if (vioscsi != NULL)
1988 		vioscsi->file.close(vioscsi->file.p, 0);
1989 
1990 	for (i = 0; i < nr_vioblk; i++)
1991 		vioblk[i].file.close(vioblk[i].file.p, 0);
1992 }
1993 
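/*
 * The *_restore() handlers below are the receive side of the *_dump()
 * routines further down: each device is serialized as its raw struct over
 * the fd with atomicio(), so state that does not survive a byte copy
 * (BAR I/O callbacks, IRQ assignments, libevent handles, mutexes) is
 * re-established after the read.  The order in virtio_restore() must
 * mirror the order in virtio_dump().
 */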
1994 int
1995 vmmci_restore(int fd, uint32_t vm_id)
1996 {
1997 	log_debug("%s: receiving vmmci", __func__);
1998 	if (atomicio(read, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
1999 		log_warnx("%s: error reading vmmci from fd", __func__);
2000 		return (-1);
2001 	}
2002 
2003 	if (pci_set_bar_fn(vmmci.pci_id, 0, vmmci_io, NULL)) {
2004 		log_warnx("%s: can't set bar fn for vmm control device",
2005 		    __progname);
2006 		return (-1);
2007 	}
2008 	vmmci.vm_id = vm_id;
2009 	vmmci.irq = pci_get_dev_irq(vmmci.pci_id);
2010 	memset(&vmmci.timeout, 0, sizeof(struct event));
2011 	evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
2012 	return (0);
2013 }
2014 
2015 int
2016 viornd_restore(int fd, struct vm_create_params *vcp)
2017 {
2018 	log_debug("%s: receiving viornd", __func__);
2019 	if (atomicio(read, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
2020 		log_warnx("%s: error reading viornd from fd", __func__);
2021 		return (-1);
2022 	}
2023 	if (pci_set_bar_fn(viornd.pci_id, 0, virtio_rnd_io, NULL)) {
2024 		log_warnx("%s: can't set bar fn for virtio rng device",
2025 		    __progname);
2026 		return (-1);
2027 	}
2028 	viornd.vm_id = vcp->vcp_id;
2029 	viornd.irq = pci_get_dev_irq(viornd.pci_id);
2030 
2031 	return (0);
2032 }
2033 
2034 int
2035 vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
2036 {
2037 	struct vmop_create_params *vmc = &vm->vm_params;
2038 	struct vm_create_params *vcp = &vmc->vmc_params;
2039 	uint8_t i;
2040 	int ret;
2041 
2042 	nr_vionet = vcp->vcp_nnics;
2043 	if (vcp->vcp_nnics > 0) {
2044 		vionet = calloc(vcp->vcp_nnics, sizeof(struct vionet_dev));
2045 		if (vionet == NULL) {
2046 			log_warn("%s: calloc failure allocating vionets",
2047 			    __progname);
2048 			return (-1);
2049 		}
2050 		log_debug("%s: receiving vionet", __func__);
2051 		if (atomicio(read, fd, vionet,
2052 		    vcp->vcp_nnics * sizeof(struct vionet_dev)) !=
2053 		    vcp->vcp_nnics * sizeof(struct vionet_dev)) {
2054 			log_warnx("%s: error reading vionet from fd",
2055 			    __func__);
2056 			return (-1);
2057 		}
2058 
2059 		/* Virtio network */
2060 		for (i = 0; i < vcp->vcp_nnics; i++) {
2061 			if (pci_set_bar_fn(vionet[i].pci_id, 0, virtio_net_io,
2062 			    &vionet[i])) {
2063 				log_warnx("%s: can't set bar fn for virtio net "
2064 				    "device", __progname);
2065 				return (-1);
2066 			}
2067 
2068 			memset(&vionet[i].mutex, 0, sizeof(pthread_mutex_t));
2069 			ret = pthread_mutex_init(&vionet[i].mutex, NULL);
2070 
2071 			if (ret) {
2072 				errno = ret;
2073 				log_warn("%s: could not initialize mutex "
2074 				    "for vionet device", __progname);
2075 				return (-1);
2076 			}
2077 			vionet[i].fd = child_taps[i];
2078 			vionet[i].vm_id = vcp->vcp_id;
2079 			vionet[i].vm_vmid = vm->vm_vmid;
2080 			vionet[i].irq = pci_get_dev_irq(vionet[i].pci_id);
2081 
2082 			memset(&vionet[i].event, 0, sizeof(struct event));
2083 			event_set(&vionet[i].event, vionet[i].fd,
2084 			    EV_READ | EV_PERSIST, vionet_rx_event, &vionet[i]);
2085 		}
2086 	}
2087 	return (0);
2088 }
2089 
2090 int
2091 vioblk_restore(int fd, struct vmop_create_params *vmc,
2092     int child_disks[][VM_MAX_BASE_PER_DISK])
2093 {
2094 	struct vm_create_params *vcp = &vmc->vmc_params;
2095 	uint8_t i;
2096 
2097 	nr_vioblk = vcp->vcp_ndisks;
2098 	vioblk = calloc(vcp->vcp_ndisks, sizeof(struct vioblk_dev));
2099 	if (vioblk == NULL) {
2100 		log_warn("%s: calloc failure allocating vioblks", __progname);
2101 		return (-1);
2102 	}
2103 	log_debug("%s: receiving vioblk", __func__);
2104 	if (atomicio(read, fd, vioblk,
2105 	    nr_vioblk * sizeof(struct vioblk_dev)) !=
2106 	    nr_vioblk * sizeof(struct vioblk_dev)) {
2107 		log_warnx("%s: error reading vioblk from fd", __func__);
2108 		return (-1);
2109 	}
2110 	for (i = 0; i < vcp->vcp_ndisks; i++) {
2111 		if (pci_set_bar_fn(vioblk[i].pci_id, 0, virtio_blk_io,
2112 		    &vioblk[i])) {
2113 			log_warnx("%s: can't set bar fn for virtio block "
2114 			    "device", __progname);
2115 			return (-1);
2116 		}
2117 		if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
2118 		    child_disks[i], vmc->vmc_diskbases[i],
2119 		    vmc->vmc_disktypes[i]) == -1)  {
2120 			log_warnx("%s: unable to determine disk format",
2121 			    __func__);
2122 			return (-1);
2123 		}
2124 		vioblk[i].vm_id = vcp->vcp_id;
2125 		vioblk[i].irq = pci_get_dev_irq(vioblk[i].pci_id);
2126 	}
2127 	return (0);
2128 }
2129 
2130 int
2131 vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom)
2132 {
2133 	if (!strlen(vcp->vcp_cdrom))
2134 		return (0);
2135 
2136 	vioscsi = calloc(1, sizeof(struct vioscsi_dev));
2137 	if (vioscsi == NULL) {
2138 		log_warn("%s: calloc failure allocating vioscsi", __progname);
2139 		return (-1);
2140 	}
2141 
2142 	log_debug("%s: receiving vioscsi", __func__);
2143 
2144 	if (atomicio(read, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
2145 	    sizeof(struct vioscsi_dev)) {
2146 		log_warnx("%s: error reading vioscsi from fd", __func__);
2147 		return (-1);
2148 	}
2149 
2150 	if (pci_set_bar_fn(vioscsi->pci_id, 0, vioscsi_io, vioscsi)) {
2151 		log_warnx("%s: can't set bar fn for vioscsi device",
2152 		    __progname);
2153 		return (-1);
2154 	}
2155 
2156 	if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1,
2157 	    VMDF_RAW) == -1) {
2158 		log_warnx("%s: unable to determine iso format", __func__);
2159 		return (-1);
2160 	}
2161 	vioscsi->vm_id = vcp->vcp_id;
2162 	vioscsi->irq = pci_get_dev_irq(vioscsi->pci_id);
2163 
2164 	return (0);
2165 }
2166 
2167 int
2168 virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
2169     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
2170 {
2171 	struct vmop_create_params *vmc = &vm->vm_params;
2172 	struct vm_create_params *vcp = &vmc->vmc_params;
2173 	int ret;
2174 
2175 	if ((ret = viornd_restore(fd, vcp)) == -1)
2176 		return ret;
2177 
2178 	if ((ret = vioblk_restore(fd, vmc, child_disks)) == -1)
2179 		return ret;
2180 
2181 	if ((ret = vioscsi_restore(fd, vcp, child_cdrom)) == -1)
2182 		return ret;
2183 
2184 	if ((ret = vionet_restore(fd, vm, child_taps)) == -1)
2185 		return ret;
2186 
2187 	if ((ret = vmmci_restore(fd, vcp->vcp_id)) == -1)
2188 		return ret;
2189 
2190 	return (0);
2191 }
2192 
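/*
 * The *_dump() routines write each device out as its raw struct, in the
 * same order that virtio_restore() reads them back; a new device type has
 * to be added to both paths.
 */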
2193 int
2194 viornd_dump(int fd)
2195 {
2196 	log_debug("%s: sending viornd", __func__);
2197 	if (atomicio(vwrite, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
2198 		log_warnx("%s: error writing viornd to fd", __func__);
2199 		return (-1);
2200 	}
2201 	return (0);
2202 }
2203 
2204 int
2205 vmmci_dump(int fd)
2206 {
2207 	log_debug("%s: sending vmmci", __func__);
2208 	if (atomicio(vwrite, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
2209 		log_warnx("%s: error writing vmmci to fd", __func__);
2210 		return (-1);
2211 	}
2212 	return (0);
2213 }
2214 
2215 int
2216 vionet_dump(int fd)
2217 {
2218 	log_debug("%s: sending vionet", __func__);
2219 	if (atomicio(vwrite, fd, vionet,
2220 	    nr_vionet * sizeof(struct vionet_dev)) !=
2221 	    nr_vionet * sizeof(struct vionet_dev)) {
2222 		log_warnx("%s: error writing vionet to fd", __func__);
2223 		return (-1);
2224 	}
2225 	return (0);
2226 }
2227 
2228 int
2229 vioblk_dump(int fd)
2230 {
2231 	log_debug("%s: sending vioblk", __func__);
2232 	if (atomicio(vwrite, fd, vioblk,
2233 	    nr_vioblk * sizeof(struct vioblk_dev)) !=
2234 	    nr_vioblk * sizeof(struct vioblk_dev)) {
2235 		log_warnx("%s: error writing vioblk to fd", __func__);
2236 		return (-1);
2237 	}
2238 	return (0);
2239 }
2240 
2241 int
2242 vioscsi_dump(int fd)
2243 {
2244 	if (vioscsi == NULL)
2245 		return (0);
2246 
2247 	log_debug("%s: sending vioscsi", __func__);
2248 	if (atomicio(vwrite, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
2249 	    sizeof(struct vioscsi_dev)) {
2250 		log_warnx("%s: error writing vioscsi to fd", __func__);
2251 		return (-1);
2252 	}
2253 	return (0);
2254 }
2255 
2256 int
2257 virtio_dump(int fd)
2258 {
2259 	int ret;
2260 
2261 	if ((ret = viornd_dump(fd)) == -1)
2262 		return ret;
2263 
2264 	if ((ret = vioblk_dump(fd)) == -1)
2265 		return ret;
2266 
2267 	if ((ret = vioscsi_dump(fd)) == -1)
2268 		return ret;
2269 
2270 	if ((ret = vionet_dump(fd)) == -1)
2271 		return ret;
2272 
2273 	if ((ret = vmmci_dump(fd)) == -1)
2274 		return ret;
2275 
2276 	return (0);
2277 }
2278 
2279 void
2280 virtio_stop(struct vm_create_params *vcp)
2281 {
2282 	uint8_t i;
2283 	for (i = 0; i < vcp->vcp_nnics; i++) {
2284 		if (event_del(&vionet[i].event)) {
2285 			log_warn("could not remove vionet event "
2286 			    "handler");
2287 			return;
2288 		}
2289 	}
2290 }
2291 
2292 void
2293 virtio_start(struct vm_create_params *vcp)
2294 {
2295 	uint8_t i;
2296 	for (i = 0; i < vcp->vcp_nnics; i++) {
2297 		if (event_add(&vionet[i].event, NULL)) {
2298 			log_warn("could not add vionet event "
2299 			    "handler");
2300 			return;
2301 		}
2302 	}
2303 }
2304