/*	$OpenBSD: vioblk.c,v 1.3 2023/05/13 23:15:28 dv Exp $	*/

/*
 * Copyright (c) 2023 Dave Voutila <dv@openbsd.org>
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <sys/mman.h>
#include <sys/param.h> /* PAGE_SIZE */

#include <dev/pci/virtio_pcireg.h>
#include <dev/pv/vioblkreg.h>
#include <dev/pv/virtioreg.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "atomicio.h"
#include "pci.h"
#include "virtio.h"
#include "vmd.h"

extern char *__progname;
extern struct vmd_vm *current_vm;

static const char *disk_type(int);
static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *);
static int handle_io_write(struct viodev_msg *, struct virtio_dev *);
int vioblk_notifyq(struct vioblk_dev *);

static void dev_dispatch_vm(int, short, void *);
static void handle_sync_io(int, short, void *);

static const char *
disk_type(int type)
{
	switch (type) {
	case VMDF_RAW: return "raw";
	case VMDF_QCOW2: return "qcow2";
	}
	return "unknown";
}

__dead void
vioblk_main(int fd, int fd_vmm)
{
	struct virtio_dev	 dev;
	struct vioblk_dev	*vioblk;
	struct viodev_msg 	 msg;
	struct vmd_vm		 vm;
	struct vm_create_params	*vcp;
	ssize_t			 sz;
	off_t			 szp = 0;
	int			 i, ret, type;

	log_procinit("vioblk");

	/*
	 * stdio - needed for read/write to disk fds and channels to the vm.
	 * vmm + proc - needed to create shared vm mappings.
	 */
	if (pledge("stdio vmm proc", NULL) == -1)
		fatal("pledge");

	/* Receive our virtio_dev, mostly preconfigured. */
	memset(&dev, 0, sizeof(dev));
	sz = atomicio(read, fd, &dev, sizeof(dev));
	if (sz != sizeof(dev)) {
		ret = errno;
		log_warn("failed to receive vioblk");
		goto fail;
	}
	if (dev.dev_type != VMD_DEVTYPE_DISK) {
		ret = EINVAL;
		log_warn("received invalid device type");
		goto fail;
	}
	dev.sync_fd = fd;
	vioblk = &dev.vioblk;

	log_debug("%s: got vioblk dev. num disk fds = %d, sync fd = %d, "
	    "async fd = %d, sz = %lld, max_xfer = %d, vmm fd = %d", __func__,
	    vioblk->ndisk_fd, dev.sync_fd, dev.async_fd, vioblk->sz,
	    vioblk->max_xfer, fd_vmm);

	/* Receive our vm information from the vm process. */
	memset(&vm, 0, sizeof(vm));
	sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm));
	if (sz != sizeof(vm)) {
		ret = EIO;
		log_warnx("failed to receive vm details");
		goto fail;
	}
	vcp = &vm.vm_params.vmc_params;
	current_vm = &vm;
	setproctitle("%s/vioblk[%d]", vcp->vcp_name, vioblk->idx);

	/* Now that we have our vm information, we can remap memory. */
	ret = remap_guest_mem(&vm, fd_vmm);
	if (ret) {
		log_warnx("failed to remap guest memory");
		goto fail;
	}

	/*
	 * We no longer need /dev/vmm access.
	 */
	close_fd(fd_vmm);
	if (pledge("stdio", NULL) == -1)
		fatal("pledge2");

	/* Initialize the virtio block abstractions. */
	type = vm.vm_params.vmc_disktypes[vioblk->idx];
	switch (type) {
	case VMDF_RAW:
		ret = virtio_raw_init(&vioblk->file, &szp, vioblk->disk_fd,
		    vioblk->ndisk_fd);
		break;
	case VMDF_QCOW2:
		ret = virtio_qcow2_init(&vioblk->file, &szp, vioblk->disk_fd,
		    vioblk->ndisk_fd);
		break;
	default:
		log_warnx("invalid disk image type");
		goto fail;
	}
	if (ret || szp < 0) {
		log_warnx("failed to init disk %s image", disk_type(type));
		goto fail;
	}
	vioblk->sz = szp;
	log_debug("%s: initialized vioblk[%d] with %s image (sz=%lld)",
	    __func__, vioblk->idx, disk_type(type), vioblk->sz);

	/* If we're restoring a received vm, reinitialize the virtqueue hva. */
	if (vm.vm_state & VM_STATE_RECEIVED)
		vioblk_update_qa(vioblk);

	/* Initialize libevent so we can start wiring event handlers. */
	event_init();

	/* Wire up an async imsg channel. */
	log_debug("%s: wiring in async vm event handler (fd=%d)", __func__,
		dev.async_fd);
	if (vm_device_pipe(&dev, dev_dispatch_vm)) {
		ret = EIO;
		log_warnx("vm_device_pipe");
		goto fail;
	}

	/* Configure our sync channel event handler. */
	log_debug("%s: wiring in sync channel handler (fd=%d)", __func__,
		dev.sync_fd);
	if (fcntl(dev.sync_fd, F_SETFL, O_NONBLOCK) == -1) {
		ret = errno;
		log_warn("%s: fcntl", __func__);
		goto fail;
	}
	imsg_init(&dev.sync_iev.ibuf, dev.sync_fd);
	dev.sync_iev.handler = handle_sync_io;
	dev.sync_iev.data = &dev;
	dev.sync_iev.events = EV_READ;
	imsg_event_add(&dev.sync_iev);

	/* Send a ready message over the sync channel. */
	log_debug("%s: telling vm %s device is ready", __func__, vcp->vcp_name);
	memset(&msg, 0, sizeof(msg));
	msg.type = VIODEV_MSG_READY;
	imsg_compose_event(&dev.sync_iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
	    sizeof(msg));

	/* Send a ready message over the async channel. */
	log_debug("%s: sending heartbeat", __func__);
	ret = imsg_compose_event(&dev.async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
	    &msg, sizeof(msg));
	if (ret == -1) {
		log_warnx("%s: failed to send async ready message!", __func__);
		goto fail;
	}

	/* Engage the event loop! */
	ret = event_dispatch();

	if (ret == 0) {
		/* Clean shutdown. */
		close_fd(dev.sync_fd);
		close_fd(dev.async_fd);
		for (i = 0; i < vioblk->ndisk_fd; i++)
			close_fd(vioblk->disk_fd[i]);
		_exit(0);
		/* NOTREACHED */
	}

fail:
	/* Try letting the vm know we've failed something. */
	memset(&msg, 0, sizeof(msg));
	msg.type = VIODEV_MSG_ERROR;
	msg.data = ret;
	imsg_compose(&dev.sync_iev.ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
	    sizeof(msg));
	imsg_flush(&dev.sync_iev.ibuf);

	close_fd(dev.sync_fd);
	close_fd(dev.async_fd);
	for (i = 0; i < vioblk->ndisk_fd; i++)
		close_fd(vioblk->disk_fd[i]);
	_exit(ret);
	/* NOTREACHED */
}
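
/*
 * Descriptive note: vioblk_main() leaves the process with two imsg
 * channels wired into libevent.  The sync channel (sync_fd) carries
 * register-level VIODEV_MSG_IO_READ/IO_WRITE traffic and is serviced
 * by handle_sync_io(); the async channel (async_fd) carries vm
 * lifecycle messages such as pause/unpause and is serviced by
 * dev_dispatch_vm().
 */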

const char *
vioblk_cmd_name(uint32_t type)
{
	switch (type) {
	case VIRTIO_BLK_T_IN: return "read";
	case VIRTIO_BLK_T_OUT: return "write";
	case VIRTIO_BLK_T_SCSI_CMD: return "scsi read";
	case VIRTIO_BLK_T_SCSI_CMD_OUT: return "scsi write";
	case VIRTIO_BLK_T_FLUSH: return "flush";
	case VIRTIO_BLK_T_FLUSH_OUT: return "flush out";
	case VIRTIO_BLK_T_GET_ID: return "get id";
	default: return "unknown";
	}
}

void
vioblk_update_qa(struct vioblk_dev *dev)
{
	struct virtio_vq_info *vq_info;
	void *hva = NULL;

	/* Invalid queue? */
	if (dev->cfg.queue_select > 0)
		return;

	vq_info = &dev->vq[dev->cfg.queue_select];
	vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE;

	hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIOBLK_QUEUE_SIZE));
	if (hva == NULL)
		fatal("vioblk_update_qa");
	vq_info->q_hva = hva;
}

void
vioblk_update_qs(struct vioblk_dev *dev)
{
	struct virtio_vq_info *vq_info;

	/* Invalid queue? */
	if (dev->cfg.queue_select > 0) {
		dev->cfg.queue_size = 0;
		return;
	}

	vq_info = &dev->vq[dev->cfg.queue_select];

	/* Update queue pfn/size based on queue select */
	dev->cfg.queue_pfn = vq_info->q_gpa >> 12;
	dev->cfg.queue_size = vq_info->qs;
}
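
/*
 * An illustrative sketch (not part of the device): with the legacy
 * virtio PCI layout used here, the guest programs a queue by writing a
 * page frame number and the host recovers the guest physical address
 * by multiplying it back out.  Assuming a hypothetical guest wrote
 * pfn 0x12345:
 *
 *	uint32_t pfn = 0x12345;
 *	uint64_t gpa = (uint64_t)pfn * VIRTIO_PAGE_SIZE;  // 0x12345000
 *	uint32_t readback = (uint32_t)(gpa >> 12);        // 0x12345 again
 *
 * vioblk_update_qa() performs the first conversion and
 * vioblk_update_qs() the second; the ">> 12" assumes the 4096-byte
 * VIRTIO_PAGE_SIZE of the legacy interface.
 */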

static void
vioblk_free_info(struct ioinfo *info)
{
	if (!info)
		return;
	free(info->buf);
	free(info);
}

static struct ioinfo *
vioblk_start_read(struct vioblk_dev *dev, off_t sector, size_t sz)
{
	struct ioinfo *info;

	/* Limit to 64M for now */
	if (sz > (1 << 26)) {
		log_warnx("%s: read size exceeded 64M", __func__);
		return (NULL);
	}

	info = calloc(1, sizeof(*info));
	if (!info)
		goto nomem;
	info->buf = malloc(sz);
	if (info->buf == NULL)
		goto nomem;
	info->len = sz;
	info->offset = sector * VIRTIO_BLK_SECTOR_SIZE;
	info->file = &dev->file;
	return info;

nomem:
	free(info);
	log_warn("malloc error vioblk read");
	return (NULL);
}
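
/*
 * Worked example (illustrative): sector numbers are in 512-byte units
 * (VIRTIO_BLK_SECTOR_SIZE), so a request for sector 8 maps to byte
 * offset 8 * 512 = 4096 in the backing image, and a 16384-byte data
 * descriptor spans 32 sectors, which is how secbias advances between
 * descriptors in vioblk_notifyq() below.
 */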


static const uint8_t *
vioblk_finish_read(struct ioinfo *info)
{
	struct virtio_backing *file;

	file = info->file;
	if (file == NULL || file->pread == NULL) {
		log_warnx("%s: null backing file or pread handler", __func__);
		return NULL;
	}
	if (file->pread(file->p, info->buf, info->len, info->offset) != info->len) {
		info->error = errno;
		log_warn("vioblk read error");
		return NULL;
	}

	return info->buf;
}

static struct ioinfo *
vioblk_start_write(struct vioblk_dev *dev, off_t sector,
    paddr_t addr, size_t len)
{
	struct ioinfo *info;

	/* Limit to 64M for now */
	if (len > (1 << 26)) {
		log_warnx("%s: write size exceeded 64M", __func__);
		return (NULL);
	}

	info = calloc(1, sizeof(*info));
	if (!info)
		goto nomem;

	info->buf = malloc(len);
	if (info->buf == NULL)
		goto nomem;
	info->len = len;
	info->offset = sector * VIRTIO_BLK_SECTOR_SIZE;
	info->file = &dev->file;

	if (read_mem(addr, info->buf, info->len)) {
		vioblk_free_info(info);
		return NULL;
	}

	return info;

nomem:
	free(info);
	log_warn("malloc error vioblk write");
	return (NULL);
}

static int
vioblk_finish_write(struct ioinfo *info)
{
	struct virtio_backing *file;

	file = info->file;
	if (file->pwrite(file->p, info->buf, info->len, info->offset) != info->len) {
		log_warn("vioblk write error");
		return EIO;
	}
	return 0;
}

/*
 * XXX in various cases, ds should be set to VIRTIO_BLK_S_IOERR, if we can
 */
int
vioblk_notifyq(struct vioblk_dev *dev)
{
	uint16_t idx, cmd_desc_idx, secdata_desc_idx, ds_desc_idx;
	uint8_t ds;
	int cnt;
	off_t secbias;
	char *vr;
	struct vring_desc *desc, *cmd_desc, *secdata_desc, *ds_desc;
	struct vring_avail *avail;
	struct vring_used *used;
	struct virtio_blk_req_hdr cmd;
	struct virtio_vq_info *vq_info;

	/* Invalid queue? */
	if (dev->cfg.queue_notify > 0)
		return (0);

	vq_info = &dev->vq[dev->cfg.queue_notify];
	vr = vq_info->q_hva;
	if (vr == NULL)
		fatalx("%s: null vring", __func__);

	/* Compute offsets in ring of descriptors, avail ring, and used ring */
	desc = (struct vring_desc *)(vr);
	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
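
	/*
	 * For orientation (descriptive only, nothing below relies on it):
	 * a legacy split virtqueue is a single contiguous guest
	 * allocation laid out roughly as
	 *
	 *	struct vring_desc	desc[VIOBLK_QUEUE_SIZE];
	 *	struct vring_avail	avail;	// driver -> device ring
	 *	...padding to the next VIRTIO_PAGE_SIZE boundary...
	 *	struct vring_used	used;	// device -> driver ring
	 *
	 * vq_availoffset and vq_usedoffset were precomputed from
	 * vring_size(VIOBLK_QUEUE_SIZE) when the queue was configured.
	 */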

	idx = vq_info->last_avail & VIOBLK_QUEUE_MASK;

	if ((avail->idx & VIOBLK_QUEUE_MASK) == idx) {
		log_debug("%s - nothing to do?", __func__);
		return (0);
	}

	while (idx != (avail->idx & VIOBLK_QUEUE_MASK)) {

		ds = VIRTIO_BLK_S_IOERR;
		cmd_desc_idx = avail->ring[idx] & VIOBLK_QUEUE_MASK;
		cmd_desc = &desc[cmd_desc_idx];

		if ((cmd_desc->flags & VRING_DESC_F_NEXT) == 0) {
			log_warnx("unchained vioblk cmd descriptor received "
			    "(idx %d)", cmd_desc_idx);
			goto out;
		}

		/* Read command from descriptor ring */
		if (cmd_desc->flags & VRING_DESC_F_WRITE) {
			log_warnx("vioblk: unexpected writable cmd descriptor "
			    "%d", cmd_desc_idx);
			goto out;
		}
		if (read_mem(cmd_desc->addr, &cmd, sizeof(cmd))) {
			log_warnx("vioblk: command read_mem error @ 0x%llx",
			    cmd_desc->addr);
			goto out;
		}

		switch (cmd.type) {
		case VIRTIO_BLK_T_IN:
			/* first descriptor */
			secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
			secdata_desc = &desc[secdata_desc_idx];

			if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
				log_warnx("unchained vioblk data descriptor "
				    "received (idx %d)", cmd_desc_idx);
				goto out;
			}

			cnt = 0;
			secbias = 0;
			do {
				struct ioinfo *info;
				const uint8_t *secdata;

				if ((secdata_desc->flags & VRING_DESC_F_WRITE)
				    == 0) {
					log_warnx("vioblk: unwritable data "
					    "descriptor %d", secdata_desc_idx);
					goto out;
				}

				info = vioblk_start_read(dev,
				    cmd.sector + secbias, secdata_desc->len);

				if (info == NULL) {
					log_warnx("vioblk: can't start read");
					goto out;
				}

				/* read the data, use current data descriptor */
				secdata = vioblk_finish_read(info);
				if (secdata == NULL) {
					vioblk_free_info(info);
					log_warnx("vioblk: block read error, "
					    "sector %lld", cmd.sector);
					goto out;
				}

				if (write_mem(secdata_desc->addr, secdata,
					secdata_desc->len)) {
					log_warnx("can't write sector "
					    "data to gpa @ 0x%llx",
					    secdata_desc->addr);
					vioblk_free_info(info);
					goto out;
				}

				vioblk_free_info(info);

				secbias += (secdata_desc->len /
				    VIRTIO_BLK_SECTOR_SIZE);
				secdata_desc_idx = secdata_desc->next &
				    VIOBLK_QUEUE_MASK;
				secdata_desc = &desc[secdata_desc_idx];

				/* Guard against infinite chains */
				if (++cnt >= VIOBLK_QUEUE_SIZE) {
					log_warnx("%s: descriptor table "
					    "invalid", __func__);
					goto out;
				}
			} while (secdata_desc->flags & VRING_DESC_F_NEXT);

			ds_desc_idx = secdata_desc_idx;
			ds_desc = secdata_desc;

			ds = VIRTIO_BLK_S_OK;
			break;
		case VIRTIO_BLK_T_OUT:
			secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
			secdata_desc = &desc[secdata_desc_idx];

			if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
				log_warnx("wr vioblk: unchained vioblk data "
				    "descriptor received (idx %d)",
				    cmd_desc_idx);
				goto out;
			}

			if (secdata_desc->len > dev->max_xfer) {
				log_warnx("%s: invalid write size %d requested",
				    __func__, secdata_desc->len);
				goto out;
			}

			cnt = 0;
			secbias = 0;
			do {
				struct ioinfo *info;

				if (secdata_desc->flags & VRING_DESC_F_WRITE) {
					log_warnx("wr vioblk: unexpected "
					    "writable data descriptor %d",
					    secdata_desc_idx);
					goto out;
				}

				info = vioblk_start_write(dev,
				    cmd.sector + secbias,
				    secdata_desc->addr, secdata_desc->len);

				if (info == NULL) {
					log_warnx("wr vioblk: can't read "
					    "sector data @ 0x%llx",
					    secdata_desc->addr);
					goto out;
				}

				if (vioblk_finish_write(info)) {
					log_warnx("wr vioblk: disk write "
					    "error");
					vioblk_free_info(info);
					goto out;
				}

				vioblk_free_info(info);

				secbias += secdata_desc->len /
				    VIRTIO_BLK_SECTOR_SIZE;

				secdata_desc_idx = secdata_desc->next &
				    VIOBLK_QUEUE_MASK;
				secdata_desc = &desc[secdata_desc_idx];

				/* Guard against infinite chains */
				if (++cnt >= VIOBLK_QUEUE_SIZE) {
					log_warnx("%s: descriptor table "
					    "invalid", __func__);
					goto out;
				}
			} while (secdata_desc->flags & VRING_DESC_F_NEXT);

			ds_desc_idx = secdata_desc_idx;
			ds_desc = secdata_desc;

			ds = VIRTIO_BLK_S_OK;
			break;
		case VIRTIO_BLK_T_FLUSH:
		case VIRTIO_BLK_T_FLUSH_OUT:
			ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
			ds_desc = &desc[ds_desc_idx];

			ds = VIRTIO_BLK_S_UNSUPP;
			break;
		case VIRTIO_BLK_T_GET_ID:
			secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
			secdata_desc = &desc[secdata_desc_idx];

			/*
			 * We don't support this command yet. While it's not
			 * officially part of the virtio spec (it will be in
			 * v1.2), there's no feature to negotiate. Linux
			 * drivers will often send this command regardless.
			 *
			 * When the command is received, it should appear as a
			 * chain of 3 descriptors, similar to the IN/OUT
			 * commands. The middle descriptor should have a
			 * length of VIRTIO_BLK_ID_BYTES bytes.
			 */
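			/*
			 * Illustrative layout of such a chain (the
			 * indices h, d, s are hypothetical):
			 *
			 *	desc[h]: request header, type GET_ID
			 *	desc[d]: VIRTIO_BLK_ID_BYTES id buffer,
			 *		 device-writable
			 *	desc[s]: 1-byte status, device-writable,
			 *		 answered below with S_UNSUPP
			 */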
			if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) {
				log_warnx("id vioblk: unchained vioblk data "
				    "descriptor received (idx %d)",
				    cmd_desc_idx);
				goto out;
			}

			/* Skip the data descriptor. */
			ds_desc_idx = secdata_desc->next & VIOBLK_QUEUE_MASK;
			ds_desc = &desc[ds_desc_idx];

			ds = VIRTIO_BLK_S_UNSUPP;
			break;
		default:
			log_warnx("%s: unsupported command 0x%x", __func__,
			    cmd.type);
			ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK;
			ds_desc = &desc[ds_desc_idx];

			ds = VIRTIO_BLK_S_UNSUPP;
			break;
		}

		if ((ds_desc->flags & VRING_DESC_F_WRITE) == 0) {
			log_warnx("%s: ds descriptor %d unwritable", __func__,
			    ds_desc_idx);
			goto out;
		}
		if (write_mem(ds_desc->addr, &ds, sizeof(ds))) {
			log_warnx("%s: can't write device status data @ 0x%llx",
			    __func__, ds_desc->addr);
			goto out;
		}

		dev->cfg.isr_status = 1;
		used->ring[used->idx & VIOBLK_QUEUE_MASK].id = cmd_desc_idx;
		used->ring[used->idx & VIOBLK_QUEUE_MASK].len = cmd_desc->len;
		__sync_synchronize();
		used->idx++;

		vq_info->last_avail = avail->idx & VIOBLK_QUEUE_MASK;
		idx = (idx + 1) & VIOBLK_QUEUE_MASK;
	}
out:
	return (1);
}
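
/*
 * A minimal sketch (illustrative, from a hypothetical guest's point of
 * view) of the request format vioblk_notifyq() consumes: a read of one
 * sector is a three-descriptor chain,
 *
 *	struct virtio_blk_req_hdr hdr = {
 *		.type = VIRTIO_BLK_T_IN,
 *		.sector = 0,			// in 512-byte units
 *	};
 *	uint8_t data[VIRTIO_BLK_SECTOR_SIZE];	// filled in by the host
 *	uint8_t status;				// set to a VIRTIO_BLK_S_*
 *
 * where the header descriptor is read-only for the device and the data
 * and status descriptors carry VRING_DESC_F_WRITE.
 */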

static void
dev_dispatch_vm(int fd, short event, void *arg)
{
	struct virtio_dev	*dev = (struct virtio_dev *)arg;
	struct imsgev		*iev = &dev->async_iev;
	struct imsgbuf		*ibuf = &iev->ibuf;
	struct imsg	 	 imsg;
	ssize_t			 n = 0;

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0) {
			/* this pipe is dead, so remove the event handler */
			log_debug("%s: pipe dead (EV_READ)", __func__);
			event_del(&iev->ev);
			event_loopexit(NULL);
			return;
		}
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write", __func__);
		if (n == 0) {
			/* this pipe is dead, so remove the event handler */
			log_debug("%s: pipe dead (EV_WRITE)", __func__);
			event_del(&iev->ev);
			event_loopbreak();
			return;
		}
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

		switch (imsg.hdr.type) {
		case IMSG_VMDOP_PAUSE_VM:
			log_debug("%s: pausing", __func__);
			break;
		case IMSG_VMDOP_UNPAUSE_VM:
			log_debug("%s: unpausing", __func__);
			break;
		default:
			log_warnx("%s: unhandled imsg type %d", __func__,
			    imsg.hdr.type);
			break;
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}

/*
 * Synchronous IO handler.
 */
static void
handle_sync_io(int fd, short event, void *arg)
{
	struct virtio_dev *dev = (struct virtio_dev *)arg;
	struct imsgev *iev = &dev->sync_iev;
	struct imsgbuf *ibuf = &iev->ibuf;
	struct viodev_msg msg;
	struct imsg imsg;
	ssize_t n;

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0) {
			/* this pipe is dead, so remove the event handler */
			log_debug("%s: vioblk pipe dead (EV_READ)", __func__);
			event_del(&iev->ev);
			event_loopexit(NULL);
			return;
		}
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write", __func__);
		if (n == 0) {
			/* this pipe is dead, so remove the event handler */
			log_debug("%s: vioblk pipe dead (EV_WRITE)", __func__);
			event_del(&iev->ev);
			event_loopexit(NULL);
			return;
		}
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatalx("%s: imsg_get (n=%ld)", __func__, n);
		if (n == 0)
			break;

		/* Unpack our message. They should all be device messages! */
		IMSG_SIZE_CHECK(&imsg, &msg);
		memcpy(&msg, imsg.data, sizeof(msg));
		imsg_free(&imsg);

		switch (msg.type) {
		case VIODEV_MSG_DUMP:
			/* Dump device */
			n = atomicio(vwrite, dev->sync_fd, dev, sizeof(*dev));
			if (n != sizeof(*dev)) {
				log_warnx("%s: failed to dump vioblk device",
				    __func__);
				break;
			}
			break;
		case VIODEV_MSG_IO_READ:
			/* Read IO: make sure to send a reply */
			msg.data = handle_io_read(&msg, dev);
			msg.data_valid = 1;
			imsg_compose_event(iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
			    sizeof(msg));
			break;
		case VIODEV_MSG_IO_WRITE:
			/* Write IO: no reply needed */
			if (handle_io_write(&msg, dev) == 1)
				virtio_assert_pic_irq(dev, 0);
			break;
		case VIODEV_MSG_SHUTDOWN:
			event_del(&dev->sync_iev.ev);
			event_loopbreak();
			return;
		default:
			fatalx("%s: invalid msg type %d", __func__, msg.type);
		}
	}
	imsg_event_add(iev);
}

static int
handle_io_write(struct viodev_msg *msg, struct virtio_dev *dev)
{
	struct vioblk_dev *vioblk = &dev->vioblk;
	uint32_t data = msg->data;
	int intr = 0;

	switch (msg->reg) {
	case VIRTIO_CONFIG_DEVICE_FEATURES:
	case VIRTIO_CONFIG_QUEUE_SIZE:
	case VIRTIO_CONFIG_ISR_STATUS:
		log_warnx("%s: illegal write %x to %s", __progname, data,
		    virtio_reg_name(msg->reg));
		break;
	case VIRTIO_CONFIG_GUEST_FEATURES:
		vioblk->cfg.guest_feature = data;
		break;
	case VIRTIO_CONFIG_QUEUE_PFN:
		vioblk->cfg.queue_pfn = data;
		vioblk_update_qa(vioblk);
		break;
	case VIRTIO_CONFIG_QUEUE_SELECT:
		vioblk->cfg.queue_select = data;
		vioblk_update_qs(vioblk);
		break;
	case VIRTIO_CONFIG_QUEUE_NOTIFY:
		vioblk->cfg.queue_notify = data;
		if (vioblk_notifyq(vioblk))
			intr = 1;
		break;
	case VIRTIO_CONFIG_DEVICE_STATUS:
		vioblk->cfg.device_status = data;
		if (vioblk->cfg.device_status == 0) {
			vioblk->cfg.guest_feature = 0;
			vioblk->cfg.queue_pfn = 0;
			vioblk_update_qa(vioblk);
			vioblk->cfg.queue_size = 0;
			vioblk_update_qs(vioblk);
			vioblk->cfg.queue_select = 0;
			vioblk->cfg.queue_notify = 0;
			vioblk->cfg.isr_status = 0;
			vioblk->vq[0].last_avail = 0;
			vioblk->vq[0].notified_avail = 0;
			virtio_deassert_pic_irq(dev, msg->vcpu);
		}
		break;
	default:
		break;
	}
	return (intr);
}
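
/*
 * Note on the reset path above (descriptive): a legacy virtio guest
 * resets a device by writing 0 to VIRTIO_CONFIG_DEVICE_STATUS, so all
 * negotiated state (features, queue pfn/size/select/notify, isr
 * status, avail-ring bookkeeping) is cleared and any pending interrupt
 * is deasserted before the guest renegotiates from scratch.
 */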

static uint32_t
handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev)
{
	struct vioblk_dev *vioblk = &dev->vioblk;
	uint8_t sz = msg->io_sz;
	uint32_t data;

	if (msg->data_valid)
		data = msg->data;
	else
		data = 0;

	switch (msg->reg) {
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
		switch (sz) {
		case 4:
			data = (uint32_t)(vioblk->sz);
			break;
		case 2:
			data &= 0xFFFF0000;
			data |= (uint32_t)(vioblk->sz) & 0xFFFF;
			break;
		case 1:
			data &= 0xFFFFFF00;
			data |= (uint32_t)(vioblk->sz) & 0xFF;
			break;
		}
		/* XXX handle invalid sz */
		break;
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
		if (sz == 1) {
			data &= 0xFFFFFF00;
			data |= (uint32_t)(vioblk->sz >> 8) & 0xFF;
		}
		/* XXX handle invalid sz */
		break;
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
		if (sz == 1) {
			data &= 0xFFFFFF00;
			data |= (uint32_t)(vioblk->sz >> 16) & 0xFF;
		} else if (sz == 2) {
			data &= 0xFFFF0000;
			data |= (uint32_t)(vioblk->sz >> 16) & 0xFFFF;
		}
		/* XXX handle invalid sz */
		break;
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
		if (sz == 1) {
			data &= 0xFFFFFF00;
			data |= (uint32_t)(vioblk->sz >> 24) & 0xFF;
		}
		/* XXX handle invalid sz */
		break;
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
		switch (sz) {
		case 4:
			data = (uint32_t)(vioblk->sz >> 32);
			break;
		case 2:
			data &= 0xFFFF0000;
			data |= (uint32_t)(vioblk->sz >> 32) & 0xFFFF;
			break;
		case 1:
			data &= 0xFFFFFF00;
			data |= (uint32_t)(vioblk->sz >> 32) & 0xFF;
			break;
		}
		/* XXX handle invalid sz */
		break;
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
		if (sz == 1) {
			data &= 0xFFFFFF00;
			data |= (uint32_t)(vioblk->sz >> 40) & 0xFF;
		}
		/* XXX handle invalid sz */
		break;
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 6:
		if (sz == 1) {
			data &= 0xFFFFFF00;
			data |= (uint32_t)(vioblk->sz >> 48) & 0xFF;
		} else if (sz == 2) {
			data &= 0xFFFF0000;
			data |= (uint32_t)(vioblk->sz >> 48) & 0xFFFF;
		}
		/* XXX handle invalid sz */
		break;
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 7:
		if (sz == 1) {
			data &= 0xFFFFFF00;
			data |= (uint32_t)(vioblk->sz >> 56) & 0xFF;
		}
		/* XXX handle invalid sz */
		break;
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
		switch (sz) {
		case 4:
			data = (uint32_t)(vioblk->max_xfer);
			break;
		case 2:
			data &= 0xFFFF0000;
			data |= (uint32_t)(vioblk->max_xfer) & 0xFFFF;
			break;
		case 1:
			data &= 0xFFFFFF00;
			data |= (uint32_t)(vioblk->max_xfer) & 0xFF;
			break;
		}
		/* XXX handle invalid sz */
		break;
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 9:
		if (sz == 1) {
			data &= 0xFFFFFF00;
			data |= (uint32_t)(vioblk->max_xfer >> 8) & 0xFF;
		}
		/* XXX handle invalid sz */
		break;
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 10:
		if (sz == 1) {
			data &= 0xFFFFFF00;
			data |= (uint32_t)(vioblk->max_xfer >> 16) & 0xFF;
		} else if (sz == 2) {
			data &= 0xFFFF0000;
			data |= (uint32_t)(vioblk->max_xfer >> 16)
			    & 0xFFFF;
		}
		/* XXX handle invalid sz */
		break;
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 11:
		if (sz == 1) {
			data &= 0xFFFFFF00;
			data |= (uint32_t)(vioblk->max_xfer >> 24) & 0xFF;
		}
		/* XXX handle invalid sz */
		break;
	case VIRTIO_CONFIG_DEVICE_FEATURES:
		data = vioblk->cfg.device_feature;
		break;
	case VIRTIO_CONFIG_GUEST_FEATURES:
		data = vioblk->cfg.guest_feature;
		break;
	case VIRTIO_CONFIG_QUEUE_PFN:
		data = vioblk->cfg.queue_pfn;
		break;
	case VIRTIO_CONFIG_QUEUE_SIZE:
		data = vioblk->cfg.queue_size;
		break;
	case VIRTIO_CONFIG_QUEUE_SELECT:
		data = vioblk->cfg.queue_select;
		break;
	case VIRTIO_CONFIG_QUEUE_NOTIFY:
		data = vioblk->cfg.queue_notify;
		break;
	case VIRTIO_CONFIG_DEVICE_STATUS:
		data = vioblk->cfg.device_status;
		break;
	case VIRTIO_CONFIG_ISR_STATUS:
		data = vioblk->cfg.isr_status;
		vioblk->cfg.isr_status = 0;
		virtio_deassert_pic_irq(dev, 0);
		break;
	default:
		return (0xFFFFFFFF);
	}

	return (data);
}
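
/*
 * For reference (descriptive, with hypothetical guest-side code): the
 * window at VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 0..7 exposes the
 * 64-bit capacity in 512-byte sectors, least significant byte first,
 * and + 8..11 exposes max_xfer.  A guest could assemble the capacity
 * from two 4-byte reads,
 *
 *	uint64_t capacity;
 *	capacity = inl(base + VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI);
 *	capacity |= (uint64_t)inl(base +
 *	    VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4) << 32;
 *
 * where inl() and base stand in for whatever port-I/O accessors the
 * guest driver uses.  The sz == 1 and sz == 2 arms above exist because
 * guests may also read the window a byte or a word at a time.
 */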