xref: /openbsd-src/usr.sbin/vmd/vioblk.c (revision 8fbe85f50ddf75fc9249e5d62c94aa31cb6b07d3)
1 /*	$OpenBSD: vioblk.c,v 1.21 2024/11/27 22:32:14 kirill Exp $	*/
2 
3 /*
4  * Copyright (c) 2023 Dave Voutila <dv@openbsd.org>
5  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 #include <stdint.h>
20 
21 #include <dev/pci/virtio_pcireg.h>
22 #include <dev/pv/vioblkreg.h>
23 #include <dev/pv/virtioreg.h>
24 
25 #include <errno.h>
26 #include <event.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30 
31 #include "atomicio.h"
32 #include "pci.h"
33 #include "virtio.h"
34 #include "vmd.h"
35 
36 extern char *__progname;
37 extern struct vmd_vm *current_vm;
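/*
 * Scratch iovec array used when assembling an io request; sized to the
 * virtqueue so a single descriptor chain can never need more entries.
 */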
38 struct iovec io_v[VIOBLK_QUEUE_SIZE];
39 
40 static const char *disk_type(int);
41 static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *,
42     int8_t *);
43 static int handle_io_write(struct viodev_msg *, struct virtio_dev *);
44 
45 static void vioblk_update_qs(struct vioblk_dev *);
46 static void vioblk_update_qa(struct vioblk_dev *);
47 static int vioblk_notifyq(struct vioblk_dev *);
48 static ssize_t vioblk_rw(struct vioblk_dev *, int, off_t,
49     struct vring_desc *, struct vring_desc **);
50 
51 static void dev_dispatch_vm(int, short, void *);
52 static void handle_sync_io(int, short, void *);
53 
54 static const char *
55 disk_type(int type)
56 {
57 	switch (type) {
58 	case VMDF_RAW: return "raw";
59 	case VMDF_QCOW2: return "qcow2";
60 	}
61 	return "unknown";
62 }
63 
64 __dead void
65 vioblk_main(int fd, int fd_vmm)
66 {
67 	struct virtio_dev	 dev;
68 	struct vioblk_dev	*vioblk = NULL;
69 	struct viodev_msg 	 msg;
70 	struct vmd_vm		 vm;
71 	struct vm_create_params	*vcp;
72 	ssize_t			 sz;
73 	off_t			 szp = 0;
74 	int			 i, ret, type;
75 
76 	/*
77 	 * stdio - needed for read/write to disk fds and channels to the vm.
78 	 * vmm + proc - needed to create shared vm mappings.
79 	 */
80 	if (pledge("stdio vmm proc", NULL) == -1)
81 		fatal("pledge");
82 
83 	/* Zero and initialize io work queue. */
84 	memset(io_v, 0, nitems(io_v)*sizeof(io_v[0]));
85 
86 	/* Receive our virtio_dev, mostly preconfigured. */
87 	memset(&dev, 0, sizeof(dev));
88 	sz = atomicio(read, fd, &dev, sizeof(dev));
89 	if (sz != sizeof(dev)) {
90 		ret = errno;
91 		log_warn("failed to receive vioblk");
92 		goto fail;
93 	}
94 	if (dev.dev_type != VMD_DEVTYPE_DISK) {
95 		ret = EINVAL;
96 		log_warn("received invalid device type");
97 		goto fail;
98 	}
99 	dev.sync_fd = fd;
100 	vioblk = &dev.vioblk;
101 
102 	log_debug("%s: got vioblk dev. num disk fds = %d, sync fd = %d, "
103 	    "async fd = %d, capacity = %lld, seg_max = %u, vmm fd = %d",
104 	    __func__, vioblk->ndisk_fd, dev.sync_fd, dev.async_fd,
105 	    vioblk->capacity, vioblk->seg_max, fd_vmm);
106 
107 	/* Receive our vm information from the vm process. */
108 	memset(&vm, 0, sizeof(vm));
109 	sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm));
110 	if (sz != sizeof(vm)) {
111 		ret = EIO;
112 		log_warnx("failed to receive vm details");
113 		goto fail;
114 	}
115 	vcp = &vm.vm_params.vmc_params;
116 	current_vm = &vm;
117 
118 	setproctitle("%s/vioblk%d", vcp->vcp_name, vioblk->idx);
119 	log_procinit("vm/%s/vioblk%d", vcp->vcp_name, vioblk->idx);
120 
121 	/* Now that we have our vm information, we can remap memory. */
122 	ret = remap_guest_mem(&vm, fd_vmm);
123 	if (ret) {
124 		log_warnx("failed to remap guest memory");
125 		goto fail;
126 	}
127 
128 	/*
129 	 * We no longer need /dev/vmm access.
130 	 */
131 	close_fd(fd_vmm);
132 	if (pledge("stdio", NULL) == -1)
133 		fatal("pledge2");
134 
135 	/* Initialize the virtio block abstractions. */
136 	type = vm.vm_params.vmc_disktypes[vioblk->idx];
137 	switch (type) {
138 	case VMDF_RAW:
139 		ret = virtio_raw_init(&vioblk->file, &szp, vioblk->disk_fd,
140 		    vioblk->ndisk_fd);
141 		break;
142 	case VMDF_QCOW2:
143 		ret = virtio_qcow2_init(&vioblk->file, &szp, vioblk->disk_fd,
144 		    vioblk->ndisk_fd);
145 		break;
146 	default:
147 		log_warnx("invalid disk image type");
148 		goto fail;
149 	}
150 	if (ret || szp < 0) {
151 		log_warnx("failed to init disk %s image", disk_type(type));
152 		goto fail;
153 	}
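	/* The virtio block config reports capacity in 512-byte sectors. */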
154 	vioblk->capacity = szp / 512;
155 	log_debug("%s: initialized vioblk%d with %s image (capacity=%lld)",
156 	    __func__, vioblk->idx, disk_type(type), vioblk->capacity);
157 
158 	/* If we're restoring a received vm, reinitialize the virtqueue hva. */
159 	if (vm.vm_state & VM_STATE_RECEIVED)
160 		vioblk_update_qa(vioblk);
161 
162 	/* Initialize libevent so we can start wiring event handlers. */
163 	event_init();
164 
165 	/* Wire up an async imsg channel. */
166 	log_debug("%s: wiring in async vm event handler (fd=%d)", __func__,
167 		dev.async_fd);
168 	if (vm_device_pipe(&dev, dev_dispatch_vm, NULL)) {
169 		ret = EIO;
170 		log_warnx("vm_device_pipe");
171 		goto fail;
172 	}
173 
174 	/* Configure our sync channel event handler. */
175 	log_debug("%s: wiring in sync channel handler (fd=%d)", __func__,
176 		dev.sync_fd);
177 	if (imsgbuf_init(&dev.sync_iev.ibuf, dev.sync_fd) == -1) {
178 		log_warn("imsgbuf_init");
179 		goto fail;
180 	}
181 	imsgbuf_allow_fdpass(&dev.sync_iev.ibuf);
182 	dev.sync_iev.handler = handle_sync_io;
183 	dev.sync_iev.data = &dev;
184 	dev.sync_iev.events = EV_READ;
185 	imsg_event_add(&dev.sync_iev);
186 
187 	/* Send a ready message over the sync channel. */
188 	log_debug("%s: telling vm %s device is ready", __func__, vcp->vcp_name);
189 	memset(&msg, 0, sizeof(msg));
190 	msg.type = VIODEV_MSG_READY;
191 	imsg_compose_event(&dev.sync_iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
192 	    sizeof(msg));
193 
194 	/* Send a ready message over the async channel. */
195 	log_debug("%s: sending heartbeat", __func__);
196 	ret = imsg_compose_event(&dev.async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
197 	    &msg, sizeof(msg));
198 	if (ret == -1) {
199 		log_warnx("%s: failed to send async ready message!", __func__);
200 		goto fail;
201 	}
202 
203 	/* Engage the event loop! */
204 	ret = event_dispatch();
205 
206 	if (ret == 0) {
207 		/* Clean shutdown. */
208 		close_fd(dev.sync_fd);
209 		close_fd(dev.async_fd);
210 		for (i = 0; i < vioblk->ndisk_fd; i++)
211 			close_fd(vioblk->disk_fd[i]);
212 		_exit(0);
213 		/* NOTREACHED */
214 	}
215 
216 fail:
217 	/* Try letting the vm know we've failed something. */
218 	memset(&msg, 0, sizeof(msg));
219 	msg.type = VIODEV_MSG_ERROR;
220 	msg.data = ret;
221 	imsg_compose(&dev.sync_iev.ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
222 	    sizeof(msg));
223 	imsgbuf_flush(&dev.sync_iev.ibuf);
224 
225 	close_fd(dev.sync_fd);
226 	close_fd(dev.async_fd);
227 	if (vioblk != NULL) {
228 		for (i = 0; i < vioblk->ndisk_fd; i++)
229 			close_fd(vioblk->disk_fd[i]);
230 	}
231 	_exit(ret);
232 	/* NOTREACHED */
233 }
234 
235 const char *
236 vioblk_cmd_name(uint32_t type)
237 {
238 	switch (type) {
239 	case VIRTIO_BLK_T_IN: return "read";
240 	case VIRTIO_BLK_T_OUT: return "write";
241 	case VIRTIO_BLK_T_SCSI_CMD: return "scsi read";
242 	case VIRTIO_BLK_T_SCSI_CMD_OUT: return "scsi write";
243 	case VIRTIO_BLK_T_FLUSH: return "flush";
244 	case VIRTIO_BLK_T_FLUSH_OUT: return "flush out";
245 	case VIRTIO_BLK_T_GET_ID: return "get id";
246 	default: return "unknown";
247 	}
248 }
249 
250 static void
251 vioblk_update_qa(struct vioblk_dev *dev)
252 {
253 	struct virtio_vq_info *vq_info;
254 	void *hva = NULL;
255 
256 	/* Invalid queue? */
257 	if (dev->cfg.queue_select > 0)
258 		return;
259 
260 	vq_info = &dev->vq[dev->cfg.queue_select];
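	/*
	 * Legacy virtio: the driver programs a page frame number, so the
	 * ring's guest physical address is the pfn scaled by
	 * VIRTIO_PAGE_SIZE (4096).
	 */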
261 	vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE;
262 
263 	hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIOBLK_QUEUE_SIZE));
264 	if (hva == NULL)
265 		fatal("vioblk_update_qa");
266 	vq_info->q_hva = hva;
267 }
268 
269 static void
270 vioblk_update_qs(struct vioblk_dev *dev)
271 {
272 	struct virtio_vq_info *vq_info;
273 
274 	/* Invalid queue? */
275 	if (dev->cfg.queue_select > 0) {
276 		dev->cfg.queue_size = 0;
277 		return;
278 	}
279 
280 	vq_info = &dev->vq[dev->cfg.queue_select];
281 
282 	/* Update queue pfn/size based on queue select */
283 	dev->cfg.queue_pfn = vq_info->q_gpa >> 12;
284 	dev->cfg.queue_size = vq_info->qs;
285 }
286 
287 /*
288  * Process virtqueue notifications. If an unrecoverable error occurs, the
289  * device is put into a "needs reset" state.
290  *
291  * Returns 1 if we need to assert an IRQ.
292  */
293 static int
294 vioblk_notifyq(struct vioblk_dev *dev)
295 {
296 	uint32_t cmd_len;
297 	uint16_t idx, cmd_desc_idx;
298 	uint8_t ds;
299 	off_t offset;
300 	ssize_t sz;
301 	int is_write, notify = 0, i;
302 	char *vr;
303 	struct vring_desc *table, *desc;
304 	struct vring_avail *avail;
305 	struct vring_used *used;
306 	struct virtio_blk_req_hdr *cmd;
307 	struct virtio_vq_info *vq_info;
308 
309 	/* Invalid queue? */
310 	if (dev->cfg.queue_notify > 0)
311 		return (0);
312 
313 	vq_info = &dev->vq[dev->cfg.queue_notify];
314 	idx = vq_info->last_avail;
315 	vr = vq_info->q_hva;
316 	if (vr == NULL)
317 		fatalx("%s: null vring", __func__);
318 
319 	/* Compute offsets in table of descriptors, avail ring, and used ring */
320 	table = (struct vring_desc *)(vr);
321 	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
322 	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
323 
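	/*
	 * Each request chain is a read-only header descriptor
	 * (virtio_blk_req_hdr), zero or more data descriptors, and a
	 * final writable 1-byte status descriptor.
	 */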
324 	while (idx != avail->idx) {
325 		/* Retrieve Command descriptor. */
326 		cmd_desc_idx = avail->ring[idx & VIOBLK_QUEUE_MASK];
327 		desc = &table[cmd_desc_idx];
328 		cmd_len = desc->len;
329 
330 		/*
331 		 * Validate Command descriptor. It should be chained to another
332 		 * descriptor and not be itself writable.
333 		 */
334 		if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
335 			log_warnx("%s: unchained cmd descriptor", __func__);
336 			goto reset;
337 		}
338 		if (DESC_WRITABLE(desc)) {
339 			log_warnx("%s: invalid cmd descriptor state", __func__);
340 			goto reset;
341 		}
342 
343 		/* Retrieve the vioblk command request. */
344 		cmd = hvaddr_mem(desc->addr, sizeof(*cmd));
345 		if (cmd == NULL)
346 			goto reset;
347 
348 		/* Advance to the 2nd descriptor. */
349 		desc = &table[desc->next & VIOBLK_QUEUE_MASK];
350 
351 		/* Process each available command & chain. */
352 		switch (cmd->type) {
353 		case VIRTIO_BLK_T_IN:
354 		case VIRTIO_BLK_T_OUT:
355 			/* Read (IN) & Write (OUT) */
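			/* cmd->sector counts 512-byte sectors. */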
356 			is_write = (cmd->type == VIRTIO_BLK_T_OUT) ? 1 : 0;
357 			offset = cmd->sector * VIRTIO_BLK_SECTOR_SIZE;
358 			sz = vioblk_rw(dev, is_write, offset, table, &desc);
359 			if (sz == -1)
360 				ds = VIRTIO_BLK_S_IOERR;
361 			else
362 				ds = VIRTIO_BLK_S_OK;
363 			break;
364 		case VIRTIO_BLK_T_GET_ID:
365 			/*
366 			 * We don't support this command yet. While it's not
367 			 * officially part of the virtio spec (it will be in
368 			 * v1.2), there's no feature bit to negotiate. Linux
369 			 * drivers will often send this command regardless.
370 			 */
371 			ds = VIRTIO_BLK_S_UNSUPP;
372 			break;
373 		default:
374 			log_warnx("%s: unsupported vioblk command %d", __func__,
375 			    cmd->type);
376 			ds = VIRTIO_BLK_S_UNSUPP;
377 			break;
378 		}
379 
380 		/* Advance to the end of the chain, if needed. */
381 		i = 0;
382 		while (desc->flags & VRING_DESC_F_NEXT) {
383 			desc = &table[desc->next & VIOBLK_QUEUE_MASK];
384 			if (++i >= VIOBLK_QUEUE_SIZE) {
385 				/*
386 				 * If we encounter an infinite/looping chain,
387 				 * not much we can do but say we need a reset.
388 				 */
389 				log_warnx("%s: descriptor chain overflow",
390 				    __func__);
391 				goto reset;
392 			}
393 		}
394 
395 		/* Provide the status of our command processing. */
396 		if (!DESC_WRITABLE(desc)) {
397 			log_warnx("%s: status descriptor unwritable", __func__);
398 			goto reset;
399 		}
400 		/* Overkill as ds is 1 byte, but validates gpa. */
401 		if (write_mem(desc->addr, &ds, sizeof(ds)))
402 			log_warnx("%s: can't write device status data "
403 			    "@ 0x%llx", __func__, desc->addr);
404 
405 		dev->cfg.isr_status |= 1;
406 		notify = 1;
407 
408 		used->ring[used->idx & VIOBLK_QUEUE_MASK].id = cmd_desc_idx;
409 		used->ring[used->idx & VIOBLK_QUEUE_MASK].len = cmd_len;
410 
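		/*
		 * Make the new used ring entry visible to the guest
		 * before publishing the updated used index.
		 */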
411 		__sync_synchronize();
412 		used->idx++;
413 		idx++;
414 	}
415 
416 	vq_info->last_avail = idx;
417 	return (notify);
418 
419 reset:
420 	/*
421 	 * When setting the "needs reset" flag, the driver is notified
422 	 * via a configuration change interrupt.
423 	 */
424 	dev->cfg.device_status |= DEVICE_NEEDS_RESET;
425 	dev->cfg.isr_status |= VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
426 	return (1);
427 }
428 
429 static void
430 dev_dispatch_vm(int fd, short event, void *arg)
431 {
432 	struct virtio_dev	*dev = (struct virtio_dev *)arg;
433 	struct imsgev		*iev = &dev->async_iev;
434 	struct imsgbuf		*ibuf = &iev->ibuf;
435 	struct imsg	 	 imsg;
436 	ssize_t			 n = 0;
437 	int			 verbose;
438 
439 	if (event & EV_READ) {
440 		if ((n = imsgbuf_read(ibuf)) == -1)
441 			fatal("%s: imsgbuf_read", __func__);
442 		if (n == 0) {
443 			/* this pipe is dead, so remove the event handler */
444 			log_debug("%s: pipe dead (EV_READ)", __func__);
445 			event_del(&iev->ev);
446 			event_loopexit(NULL);
447 			return;
448 		}
449 	}
450 
451 	if (event & EV_WRITE) {
452 		if (imsgbuf_write(ibuf) == -1) {
453 			if (errno == EPIPE) {
454 				/* this pipe is dead, remove the handler */
455 				log_debug("%s: pipe dead (EV_WRITE)", __func__);
456 				event_del(&iev->ev);
457 				event_loopexit(NULL);
458 				return;
459 			}
460 			fatal("%s: imsgbuf_write", __func__);
461 		}
462 	}
463 
464 	for (;;) {
465 		if ((n = imsg_get(ibuf, &imsg)) == -1)
466 			fatal("%s: imsg_get", __func__);
467 		if (n == 0)
468 			break;
469 
470 		switch (imsg.hdr.type) {
471 		case IMSG_VMDOP_PAUSE_VM:
472 			log_debug("%s: pausing", __func__);
473 			break;
474 		case IMSG_VMDOP_UNPAUSE_VM:
475 			log_debug("%s: unpausing", __func__);
476 			break;
477 		case IMSG_CTL_VERBOSE:
478 			IMSG_SIZE_CHECK(&imsg, &verbose);
479 			memcpy(&verbose, imsg.data, sizeof(verbose));
480 			log_setverbose(verbose);
481 			break;
482 		default:
483 			log_warnx("%s: unhandled imsg type %d", __func__,
484 			    imsg.hdr.type);
485 			break;
486 		}
487 		imsg_free(&imsg);
488 	}
489 	imsg_event_add(iev);
490 }
491 
492 /*
493  * Synchronous IO handler. Services device register reads/writes and
494  * control messages sent by the vm process over the sync channel.
495  */
496 static void
497 handle_sync_io(int fd, short event, void *arg)
498 {
499 	struct virtio_dev *dev = (struct virtio_dev *)arg;
500 	struct imsgev *iev = &dev->sync_iev;
501 	struct imsgbuf *ibuf = &iev->ibuf;
502 	struct viodev_msg msg;
503 	struct imsg imsg;
504 	ssize_t n;
505 	int8_t intr = INTR_STATE_NOOP;
506 
507 	if (event & EV_READ) {
508 		if ((n = imsgbuf_read(ibuf)) == -1)
509 			fatal("%s: imsgbuf_read", __func__);
510 		if (n == 0) {
511 			/* this pipe is dead, so remove the event handler */
512 			log_debug("%s: vioblk pipe dead (EV_READ)", __func__);
513 			event_del(&iev->ev);
514 			event_loopexit(NULL);
515 			return;
516 		}
517 	}
518 
519 	if (event & EV_WRITE) {
520 		if (imsgbuf_write(ibuf) == -1) {
521 			if (errno == EPIPE) {
522 				/* this pipe is dead, remove the handler */
523 				log_debug("%s: pipe dead (EV_WRITE)", __func__);
524 				event_del(&iev->ev);
525 				event_loopexit(NULL);
526 				return;
527 			}
528 			fatal("%s: imsgbuf_write", __func__);
529 		}
530 	}
531 
532 	for (;;) {
533 		if ((n = imsg_get(ibuf, &imsg)) == -1)
534 			fatalx("%s: imsg_get (n=%ld)", __func__, n);
535 		if (n == 0)
536 			break;
537 
538 		/* Unpack our message. They ALL should be dev messages! */
539 		IMSG_SIZE_CHECK(&imsg, &msg);
540 		memcpy(&msg, imsg.data, sizeof(msg));
541 		imsg_free(&imsg);
542 
543 		switch (msg.type) {
544 		case VIODEV_MSG_DUMP:
545 			/* Dump device */
546 			n = atomicio(vwrite, dev->sync_fd, dev, sizeof(*dev));
547 			if (n != sizeof(*dev)) {
548 				log_warnx("%s: failed to dump vioblk device",
549 				    __func__);
550 				break;
551 			}
552 		case VIODEV_MSG_IO_READ:
553 			/* Read IO: make sure to send a reply */
554 			msg.data = handle_io_read(&msg, dev, &intr);
555 			msg.data_valid = 1;
556 			msg.state = intr;
557 			imsg_compose_event(iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
558 			    sizeof(msg));
559 			break;
560 		case VIODEV_MSG_IO_WRITE:
561 			/* Write IO: no reply needed */
562 			if (handle_io_write(&msg, dev) == 1)
563 				virtio_assert_irq(dev, 0);
564 			break;
565 		case VIODEV_MSG_SHUTDOWN:
566 			event_del(&dev->sync_iev.ev);
567 			event_loopbreak();
568 			return;
569 		default:
570 			fatalx("%s: invalid msg type %d", __func__, msg.type);
571 		}
572 	}
573 	imsg_event_add(iev);
574 }
575 
576 static int
577 handle_io_write(struct viodev_msg *msg, struct virtio_dev *dev)
578 {
579 	struct vioblk_dev *vioblk = &dev->vioblk;
580 	uint32_t data = msg->data;
581 	int intr = 0;
582 
583 	switch (msg->reg) {
584 	case VIRTIO_CONFIG_DEVICE_FEATURES:
585 	case VIRTIO_CONFIG_QUEUE_SIZE:
586 	case VIRTIO_CONFIG_ISR_STATUS:
587 		log_warnx("%s: illegal write %x to %s", __progname, data,
588 		    virtio_reg_name(msg->reg));
589 		break;
590 	case VIRTIO_CONFIG_GUEST_FEATURES:
591 		vioblk->cfg.guest_feature = data;
592 		break;
593 	case VIRTIO_CONFIG_QUEUE_PFN:
594 		vioblk->cfg.queue_pfn = data;
595 		vioblk_update_qa(vioblk);
596 		break;
597 	case VIRTIO_CONFIG_QUEUE_SELECT:
598 		vioblk->cfg.queue_select = data;
599 		vioblk_update_qs(vioblk);
600 		break;
601 	case VIRTIO_CONFIG_QUEUE_NOTIFY:
602 		/* XXX We should be stricter about status checks. */
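		/* Ignore notifications while a device reset is pending. */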
603 		if (!(vioblk->cfg.device_status & DEVICE_NEEDS_RESET)) {
604 			vioblk->cfg.queue_notify = data;
605 			if (vioblk_notifyq(vioblk))
606 				intr = 1;
607 		}
608 		break;
609 	case VIRTIO_CONFIG_DEVICE_STATUS:
610 		vioblk->cfg.device_status = data;
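		/* Writing zero to the status register resets the device. */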
611 		if (vioblk->cfg.device_status == 0) {
612 			vioblk->cfg.guest_feature = 0;
613 			vioblk->cfg.queue_pfn = 0;
614 			vioblk_update_qa(vioblk);
615 			vioblk->cfg.queue_size = 0;
616 			vioblk_update_qs(vioblk);
617 			vioblk->cfg.queue_select = 0;
618 			vioblk->cfg.queue_notify = 0;
619 			vioblk->cfg.isr_status = 0;
620 			vioblk->vq[0].last_avail = 0;
621 			vioblk->vq[0].notified_avail = 0;
622 			virtio_deassert_irq(dev, msg->vcpu);
623 		}
624 		break;
625 	default:
626 		break;
627 	}
628 	return (intr);
629 }
630 
631 static uint32_t
632 handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev, int8_t *intr)
633 {
634 	struct vioblk_dev *vioblk = &dev->vioblk;
635 	uint8_t sz = msg->io_sz;
636 	uint32_t data;
637 
638 	if (msg->data_valid)
639 		data = msg->data;
640 	else
641 		data = 0;
642 
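	/*
	 * Device-specific config space (offsets relative to
	 * VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI): bytes 0-7 hold the 64-bit
	 * capacity in 512-byte sectors, bytes 12-15 hold seg_max. Guests
	 * may read 1, 2 or 4 bytes at a time, hence the shifting and
	 * masking below.
	 */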
643 	switch (msg->reg) {
644 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
645 		switch (sz) {
646 		case 4:
647 			data = (uint32_t)(vioblk->capacity);
648 			break;
649 		case 2:
650 			data &= 0xFFFF0000;
651 			data |= (uint32_t)(vioblk->capacity) & 0xFFFF;
652 			break;
653 		case 1:
654 			data &= 0xFFFFFF00;
655 			data |= (uint32_t)(vioblk->capacity) & 0xFF;
656 			break;
657 		}
658 		/* XXX handle invalid sz */
659 		break;
660 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
661 		if (sz == 1) {
662 			data &= 0xFFFFFF00;
663 			data |= (uint32_t)(vioblk->capacity >> 8) & 0xFF;
664 		}
665 		/* XXX handle invalid sz */
666 		break;
667 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
668 		if (sz == 1) {
669 			data &= 0xFFFFFF00;
670 			data |= (uint32_t)(vioblk->capacity >> 16) & 0xFF;
671 		} else if (sz == 2) {
672 			data &= 0xFFFF0000;
673 			data |= (uint32_t)(vioblk->capacity >> 16) & 0xFFFF;
674 		}
675 		/* XXX handle invalid sz */
676 		break;
677 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
678 		if (sz == 1) {
679 			data &= 0xFFFFFF00;
680 			data |= (uint32_t)(vioblk->capacity >> 24) & 0xFF;
681 		}
682 		/* XXX handle invalid sz */
683 		break;
684 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
685 		switch (sz) {
686 		case 4:
687 			data = (uint32_t)(vioblk->capacity >> 32);
688 			break;
689 		case 2:
690 			data &= 0xFFFF0000;
691 			data |= (uint32_t)(vioblk->capacity >> 32) & 0xFFFF;
692 			break;
693 		case 1:
694 			data &= 0xFFFFFF00;
695 			data |= (uint32_t)(vioblk->capacity >> 32) & 0xFF;
696 			break;
697 		}
698 		/* XXX handle invalid sz */
699 		break;
700 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
701 		if (sz == 1) {
702 			data &= 0xFFFFFF00;
703 			data |= (uint32_t)(vioblk->capacity >> 40) & 0xFF;
704 		}
705 		/* XXX handle invalid sz */
706 		break;
707 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 6:
708 		if (sz == 1) {
709 			data &= 0xFFFFFF00;
710 			data |= (uint32_t)(vioblk->capacity >> 48) & 0xFF;
711 		} else if (sz == 2) {
712 			data &= 0xFFFF0000;
713 			data |= (uint32_t)(vioblk->capacity >> 48) & 0xFFFF;
714 		}
715 		/* XXX handle invalid sz */
716 		break;
717 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 7:
718 		if (sz == 1) {
719 			data &= 0xFFFFFF00;
720 			data |= (uint32_t)(vioblk->capacity >> 56) & 0xFF;
721 		}
722 		/* XXX handle invalid sz */
723 		break;
724 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
725 		switch (sz) {
726 		case 4:
727 			data = (uint32_t)(vioblk->seg_max);
728 			break;
729 		case 2:
730 			data &= 0xFFFF0000;
731 			data |= (uint32_t)(vioblk->seg_max) & 0xFFFF;
732 			break;
733 		case 1:
734 			data &= 0xFFFFFF00;
735 			data |= (uint32_t)(vioblk->seg_max) & 0xFF;
736 			break;
737 		}
738 		/* XXX handle invalid sz */
739 		break;
740 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 13:
741 		if (sz == 1) {
742 			data &= 0xFFFFFF00;
743 			data |= (uint32_t)(vioblk->seg_max >> 8) & 0xFF;
744 		}
745 		/* XXX handle invalid sz */
746 		break;
747 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 14:
748 		if (sz == 1) {
749 			data &= 0xFFFFFF00;
750 			data |= (uint32_t)(vioblk->seg_max >> 16) & 0xFF;
751 		} else if (sz == 2) {
752 			data &= 0xFFFF0000;
753 			data |= (uint32_t)(vioblk->seg_max >> 16)
754 			    & 0xFFFF;
755 		}
756 		/* XXX handle invalid sz */
757 		break;
758 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 15:
759 		if (sz == 1) {
760 			data &= 0xFFFFFF00;
761 			data |= (uint32_t)(vioblk->seg_max >> 24) & 0xFF;
762 		}
763 		/* XXX handle invalid sz */
764 		break;
765 	case VIRTIO_CONFIG_DEVICE_FEATURES:
766 		data = vioblk->cfg.device_feature;
767 		break;
768 	case VIRTIO_CONFIG_GUEST_FEATURES:
769 		data = vioblk->cfg.guest_feature;
770 		break;
771 	case VIRTIO_CONFIG_QUEUE_PFN:
772 		data = vioblk->cfg.queue_pfn;
773 		break;
774 	case VIRTIO_CONFIG_QUEUE_SIZE:
775 		data = vioblk->cfg.queue_size;
776 		break;
777 	case VIRTIO_CONFIG_QUEUE_SELECT:
778 		data = vioblk->cfg.queue_select;
779 		break;
780 	case VIRTIO_CONFIG_QUEUE_NOTIFY:
781 		data = vioblk->cfg.queue_notify;
782 		break;
783 	case VIRTIO_CONFIG_DEVICE_STATUS:
784 		data = vioblk->cfg.device_status;
785 		break;
786 	case VIRTIO_CONFIG_ISR_STATUS:
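		/* Reading the ISR clears it and deasserts the interrupt. */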
787 		data = vioblk->cfg.isr_status;
788 		vioblk->cfg.isr_status = 0;
789 		if (intr != NULL)
790 			*intr = INTR_STATE_DEASSERT;
791 		break;
792 	default:
793 		return (0xFFFFFFFF);
794 	}
795 
796 	return (data);
797 }
798 
799 /*
800  * Emulate read/write io. Walks the descriptor chain, collecting io work and
801  * then emulates the read or write.
802  *
803  * On success, returns bytes read/written.
804  * On error, returns -1 and descriptor (desc) remains at its current position.
805  */
806 static ssize_t
807 vioblk_rw(struct vioblk_dev *dev, int is_write, off_t offset,
808     struct vring_desc *desc_tbl, struct vring_desc **desc)
809 {
810 	struct iovec *iov = NULL;
811 	ssize_t sz = 0;
812 	size_t io_idx = 0;		/* Index into iovec workqueue. */
813 	size_t xfer_sz = 0;		/* Total accumulated io bytes. */
814 
815 	do {
816 		iov = &io_v[io_idx];
817 
818 		/*
819 		 * Reads require writable descriptors. Writes require
820 		 * non-writable descriptors.
821 		 */
822 		if ((!is_write) ^ DESC_WRITABLE(*desc)) {
823 			log_warnx("%s: invalid descriptor for %s command",
824 			    __func__, is_write ? "write" : "read");
825 			return (-1);
826 		}
827 
828 		/* Collect the IO segment information. */
829 		iov->iov_len = (size_t)(*desc)->len;
830 		iov->iov_base = hvaddr_mem((*desc)->addr, iov->iov_len);
831 		if (iov->iov_base == NULL)
832 			return (-1);
833 
834 		/* Move our counters. */
835 		xfer_sz += iov->iov_len;
836 		io_idx++;
837 
838 		/* Guard against infinite chains */
839 		if (io_idx >= nitems(io_v)) {
840 			log_warnx("%s: descriptor table "
841 			    "invalid", __func__);
842 			return (-1);
843 		}
844 
845 		/* Advance to the next descriptor. */
846 		*desc = &desc_tbl[(*desc)->next & VIOBLK_QUEUE_MASK];
847 	} while ((*desc)->flags & VRING_DESC_F_NEXT);
848 
849 	/*
850 	 * Validate the requested block io operation alignment and size.
851 	 * Checking the offset is mostly an extra caution, as it is derived
852 	 * from a disk sector number and should already be sector-aligned.
853 	 */
854 	if (offset % VIRTIO_BLK_SECTOR_SIZE != 0 ||
855 	    xfer_sz % VIRTIO_BLK_SECTOR_SIZE != 0) {
856 		log_warnx("%s: unaligned request", __func__);
857 		return (-1);
858 	}
859 	if (xfer_sz > SSIZE_MAX) {	/* iovec_copyin limit */
860 		log_warnx("%s: invalid %s size: %zu", __func__,
861 		    is_write ? "write" : "read", xfer_sz);
862 		return (-1);
863 	}
864 
865 	/* Emulate the Read or Write operation. */
866 	if (is_write)
867 		sz = dev->file.pwritev(dev->file.p, io_v, io_idx, offset);
868 	else
869 		sz = dev->file.preadv(dev->file.p, io_v, io_idx, offset);
870 	if (sz != (ssize_t)xfer_sz) {
871 		log_warnx("%s: %s failure at offset 0x%llx, xfer_sz=%zu, "
872 		    "sz=%ld", __func__, (is_write ? "write" : "read"), offset,
873 		    xfer_sz, sz);
874 		return (-1);
875 	}
876 
877 	return (sz);
878 }
879