xref: /openbsd-src/usr.sbin/vmd/vioblk.c (revision 8fbe85f50ddf75fc9249e5d62c94aa31cb6b07d3)
1*8fbe85f5Skirill /*	$OpenBSD: vioblk.c,v 1.21 2024/11/27 22:32:14 kirill Exp $	*/
23481ecdfSdv 
33481ecdfSdv /*
43481ecdfSdv  * Copyright (c) 2023 Dave Voutila <dv@openbsd.org>
53481ecdfSdv  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
63481ecdfSdv  *
73481ecdfSdv  * Permission to use, copy, modify, and distribute this software for any
83481ecdfSdv  * purpose with or without fee is hereby granted, provided that the above
93481ecdfSdv  * copyright notice and this permission notice appear in all copies.
103481ecdfSdv  *
113481ecdfSdv  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
123481ecdfSdv  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
133481ecdfSdv  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
143481ecdfSdv  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
153481ecdfSdv  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
163481ecdfSdv  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
173481ecdfSdv  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
183481ecdfSdv  */
1965bbee46Sjsg #include <stdint.h>
203481ecdfSdv 
213481ecdfSdv #include <dev/pci/virtio_pcireg.h>
223481ecdfSdv #include <dev/pv/vioblkreg.h>
233481ecdfSdv #include <dev/pv/virtioreg.h>
243481ecdfSdv 
253481ecdfSdv #include <errno.h>
263481ecdfSdv #include <event.h>
273481ecdfSdv #include <stdlib.h>
283481ecdfSdv #include <string.h>
293481ecdfSdv #include <unistd.h>
303481ecdfSdv 
313481ecdfSdv #include "atomicio.h"
323481ecdfSdv #include "pci.h"
333481ecdfSdv #include "virtio.h"
343481ecdfSdv #include "vmd.h"
353481ecdfSdv 
363481ecdfSdv extern char *__progname;
373481ecdfSdv extern struct vmd_vm *current_vm;
3820e554f8Sdv struct iovec io_v[VIOBLK_QUEUE_SIZE];
393481ecdfSdv 
403481ecdfSdv static const char *disk_type(int);
418761e6b4Sdv static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *,
428761e6b4Sdv     int8_t *);
433481ecdfSdv static int handle_io_write(struct viodev_msg *, struct virtio_dev *);
4420e554f8Sdv 
4520e554f8Sdv static void vioblk_update_qs(struct vioblk_dev *);
4620e554f8Sdv static void vioblk_update_qa(struct vioblk_dev *);
4720e554f8Sdv static int vioblk_notifyq(struct vioblk_dev *);
4820e554f8Sdv static ssize_t vioblk_rw(struct vioblk_dev *, int, off_t,
4920e554f8Sdv     struct vring_desc *, struct vring_desc **);
503481ecdfSdv 
513481ecdfSdv static void dev_dispatch_vm(int, short, void *);
523481ecdfSdv static void handle_sync_io(int, short, void *);
533481ecdfSdv 
543481ecdfSdv static const char *
553481ecdfSdv disk_type(int type)
563481ecdfSdv {
573481ecdfSdv 	switch (type) {
583481ecdfSdv 	case VMDF_RAW: return "raw";
593481ecdfSdv 	case VMDF_QCOW2: return "qcow2";
603481ecdfSdv 	}
613481ecdfSdv 	return "unknown";
623481ecdfSdv }
633481ecdfSdv 
643481ecdfSdv __dead void
653c817da7Sdv vioblk_main(int fd, int fd_vmm)
663481ecdfSdv {
673481ecdfSdv 	struct virtio_dev	 dev;
6808e69010Sjsg 	struct vioblk_dev	*vioblk = NULL;
693481ecdfSdv 	struct viodev_msg 	 msg;
703481ecdfSdv 	struct vmd_vm		 vm;
713481ecdfSdv 	struct vm_create_params	*vcp;
723481ecdfSdv 	ssize_t			 sz;
733481ecdfSdv 	off_t			 szp = 0;
743481ecdfSdv 	int			 i, ret, type;
753481ecdfSdv 
763c817da7Sdv 	/*
773c817da7Sdv 	 * stdio - needed for read/write to disk fds and channels to the vm.
783c817da7Sdv 	 * vmm + proc - needed to create shared vm mappings.
793c817da7Sdv 	 */
803c817da7Sdv 	if (pledge("stdio vmm proc", NULL) == -1)
813481ecdfSdv 		fatal("pledge");
823481ecdfSdv 
8320e554f8Sdv 	/* Zero and initialize io work queue. */
8420e554f8Sdv 	memset(io_v, 0, nitems(io_v)*sizeof(io_v[0]));
8520e554f8Sdv 
863481ecdfSdv 	/* Receive our virtio_dev, mostly preconfigured. */
873481ecdfSdv 	memset(&dev, 0, sizeof(dev));
883481ecdfSdv 	sz = atomicio(read, fd, &dev, sizeof(dev));
893481ecdfSdv 	if (sz != sizeof(dev)) {
903481ecdfSdv 		ret = errno;
91d074e402Sdv 		log_warn("failed to receive vioblk");
923481ecdfSdv 		goto fail;
933481ecdfSdv 	}
943481ecdfSdv 	if (dev.dev_type != VMD_DEVTYPE_DISK) {
953481ecdfSdv 		ret = EINVAL;
963481ecdfSdv 		log_warn("received invalid device type");
973481ecdfSdv 		goto fail;
983481ecdfSdv 	}
993481ecdfSdv 	dev.sync_fd = fd;
1003481ecdfSdv 	vioblk = &dev.vioblk;
1013481ecdfSdv 
1023481ecdfSdv 	log_debug("%s: got viblk dev. num disk fds = %d, sync fd = %d, "
10320e554f8Sdv 	    "async fd = %d, capacity = %lld seg_max = %u, vmm fd = %d",
10420e554f8Sdv 	    __func__, vioblk->ndisk_fd, dev.sync_fd, dev.async_fd,
10520e554f8Sdv 	    vioblk->capacity, vioblk->seg_max, fd_vmm);
1063481ecdfSdv 
1073481ecdfSdv 	/* Receive our vm information from the vm process. */
1083481ecdfSdv 	memset(&vm, 0, sizeof(vm));
1093481ecdfSdv 	sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm));
1103481ecdfSdv 	if (sz != sizeof(vm)) {
1113481ecdfSdv 		ret = EIO;
1123481ecdfSdv 		log_warnx("failed to receive vm details");
1133481ecdfSdv 		goto fail;
1143481ecdfSdv 	}
1153481ecdfSdv 	vcp = &vm.vm_params.vmc_params;
1163481ecdfSdv 	current_vm = &vm;
11708d0da61Sdv 
11808d0da61Sdv 	setproctitle("%s/vioblk%d", vcp->vcp_name, vioblk->idx);
11908d0da61Sdv 	log_procinit("vm/%s/vioblk%d", vcp->vcp_name, vioblk->idx);
1203481ecdfSdv 
1213481ecdfSdv 	/* Now that we have our vm information, we can remap memory. */
1223c817da7Sdv 	ret = remap_guest_mem(&vm, fd_vmm);
1233481ecdfSdv 	if (ret) {
1243481ecdfSdv 		log_warnx("failed to remap guest memory");
1253481ecdfSdv 		goto fail;
1263481ecdfSdv 	}
1273481ecdfSdv 
1283c817da7Sdv 	/*
1293c817da7Sdv 	 * We no longer need /dev/vmm access.
1303c817da7Sdv 	 */
1313c817da7Sdv 	close_fd(fd_vmm);
1323c817da7Sdv 	if (pledge("stdio", NULL) == -1)
1333c817da7Sdv 		fatal("pledge2");
1343c817da7Sdv 
1353481ecdfSdv 	/* Initialize the virtio block abstractions. */
1363481ecdfSdv 	type = vm.vm_params.vmc_disktypes[vioblk->idx];
1373481ecdfSdv 	switch (type) {
1383481ecdfSdv 	case VMDF_RAW:
1393481ecdfSdv 		ret = virtio_raw_init(&vioblk->file, &szp, vioblk->disk_fd,
1403481ecdfSdv 		    vioblk->ndisk_fd);
1413481ecdfSdv 		break;
1423481ecdfSdv 	case VMDF_QCOW2:
1433481ecdfSdv 		ret = virtio_qcow2_init(&vioblk->file, &szp, vioblk->disk_fd,
1443481ecdfSdv 		    vioblk->ndisk_fd);
1453481ecdfSdv 		break;
1463481ecdfSdv 	default:
1473481ecdfSdv 		log_warnx("invalid disk image type");
1483481ecdfSdv 		goto fail;
1493481ecdfSdv 	}
1503481ecdfSdv 	if (ret || szp < 0) {
1513481ecdfSdv 		log_warnx("failed to init disk %s image", disk_type(type));
1523481ecdfSdv 		goto fail;
1533481ecdfSdv 	}
15420e554f8Sdv 	vioblk->capacity = szp / 512;
15508d0da61Sdv 	log_debug("%s: initialized vioblk%d with %s image (capacity=%lld)",
15620e554f8Sdv 	    __func__, vioblk->idx, disk_type(type), vioblk->capacity);
1573481ecdfSdv 
1583481ecdfSdv 	/* If we're restoring hardware, reinitialize the virtqueue hva. */
1593481ecdfSdv 	if (vm.vm_state & VM_STATE_RECEIVED)
1603481ecdfSdv 		vioblk_update_qa(vioblk);
1613481ecdfSdv 
1623481ecdfSdv 	/* Initialize libevent so we can start wiring event handlers. */
1633481ecdfSdv 	event_init();
1643481ecdfSdv 
1653481ecdfSdv 	/* Wire up an async imsg channel. */
1663481ecdfSdv 	log_debug("%s: wiring in async vm event handler (fd=%d)", __func__,
1673481ecdfSdv 		dev.async_fd);
168a246f7a0Sdv 	if (vm_device_pipe(&dev, dev_dispatch_vm, NULL)) {
1693481ecdfSdv 		ret = EIO;
1703481ecdfSdv 		log_warnx("vm_device_pipe");
1713481ecdfSdv 		goto fail;
1723481ecdfSdv 	}
1733481ecdfSdv 
1743481ecdfSdv 	/* Configure our sync channel event handler. */
1753481ecdfSdv 	log_debug("%s: wiring in sync channel handler (fd=%d)", __func__,
1763481ecdfSdv 		dev.sync_fd);
1770a9d031fSclaudio 	if (imsgbuf_init(&dev.sync_iev.ibuf, dev.sync_fd) == -1) {
1780a9d031fSclaudio 		log_warn("imsgbuf_init");
1790a9d031fSclaudio 		goto fail;
1800a9d031fSclaudio 	}
1810a9d031fSclaudio 	imsgbuf_allow_fdpass(&dev.sync_iev.ibuf);
1823481ecdfSdv 	dev.sync_iev.handler = handle_sync_io;
1833481ecdfSdv 	dev.sync_iev.data = &dev;
1843481ecdfSdv 	dev.sync_iev.events = EV_READ;
1853481ecdfSdv 	imsg_event_add(&dev.sync_iev);
1863481ecdfSdv 
1873481ecdfSdv 	/* Send a ready message over the sync channel. */
1883481ecdfSdv 	log_debug("%s: telling vm %s device is ready", __func__, vcp->vcp_name);
1893481ecdfSdv 	memset(&msg, 0, sizeof(msg));
1903481ecdfSdv 	msg.type = VIODEV_MSG_READY;
1913481ecdfSdv 	imsg_compose_event(&dev.sync_iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
1923481ecdfSdv 	    sizeof(msg));
1933481ecdfSdv 
1943481ecdfSdv 	/* Send a ready message over the async channel. */
1953481ecdfSdv 	log_debug("%s: sending heartbeat", __func__);
1963481ecdfSdv 	ret = imsg_compose_event(&dev.async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
1973481ecdfSdv 	    &msg, sizeof(msg));
1983481ecdfSdv 	if (ret == -1) {
1993481ecdfSdv 		log_warnx("%s: failed to send async ready message!", __func__);
2003481ecdfSdv 		goto fail;
2013481ecdfSdv 	}
2023481ecdfSdv 
2033481ecdfSdv 	/* Engage the event loop! */
2043481ecdfSdv 	ret = event_dispatch();
2053481ecdfSdv 
2063481ecdfSdv 	if (ret == 0) {
2073481ecdfSdv 		/* Clean shutdown. */
2083481ecdfSdv 		close_fd(dev.sync_fd);
2093481ecdfSdv 		close_fd(dev.async_fd);
21008e69010Sjsg 		for (i = 0; i < vioblk->ndisk_fd; i++)
2113481ecdfSdv 			close_fd(vioblk->disk_fd[i]);
2123481ecdfSdv 		_exit(0);
2133481ecdfSdv 		/* NOTREACHED */
2143481ecdfSdv 	}
2153481ecdfSdv 
2163481ecdfSdv fail:
2173481ecdfSdv 	/* Try letting the vm know we've failed something. */
2183481ecdfSdv 	memset(&msg, 0, sizeof(msg));
2193481ecdfSdv 	msg.type = VIODEV_MSG_ERROR;
2203481ecdfSdv 	msg.data = ret;
2213481ecdfSdv 	imsg_compose(&dev.sync_iev.ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
2223481ecdfSdv 	    sizeof(msg));
223dd7efffeSclaudio 	imsgbuf_flush(&dev.sync_iev.ibuf);
2243481ecdfSdv 
2253481ecdfSdv 	close_fd(dev.sync_fd);
2263481ecdfSdv 	close_fd(dev.async_fd);
22708e69010Sjsg 	if (vioblk != NULL) {
22808e69010Sjsg 		for (i = 0; i < vioblk->ndisk_fd; i++)
2293481ecdfSdv 			close_fd(vioblk->disk_fd[i]);
23008e69010Sjsg 	}
2313481ecdfSdv 	_exit(ret);
2323481ecdfSdv 	/* NOTREACHED */
2333481ecdfSdv }
2343481ecdfSdv 
2353481ecdfSdv const char *
2363481ecdfSdv vioblk_cmd_name(uint32_t type)
2373481ecdfSdv {
2383481ecdfSdv 	switch (type) {
2393481ecdfSdv 	case VIRTIO_BLK_T_IN: return "read";
2403481ecdfSdv 	case VIRTIO_BLK_T_OUT: return "write";
2413481ecdfSdv 	case VIRTIO_BLK_T_SCSI_CMD: return "scsi read";
2423481ecdfSdv 	case VIRTIO_BLK_T_SCSI_CMD_OUT: return "scsi write";
2433481ecdfSdv 	case VIRTIO_BLK_T_FLUSH: return "flush";
2443481ecdfSdv 	case VIRTIO_BLK_T_FLUSH_OUT: return "flush out";
2453481ecdfSdv 	case VIRTIO_BLK_T_GET_ID: return "get id";
2463481ecdfSdv 	default: return "unknown";
2473481ecdfSdv 	}
2483481ecdfSdv }
2493481ecdfSdv 
25020e554f8Sdv static void
2513481ecdfSdv vioblk_update_qa(struct vioblk_dev *dev)
2523481ecdfSdv {
2533481ecdfSdv 	struct virtio_vq_info *vq_info;
2543481ecdfSdv 	void *hva = NULL;
2553481ecdfSdv 
2563481ecdfSdv 	/* Invalid queue? */
2573481ecdfSdv 	if (dev->cfg.queue_select > 0)
2583481ecdfSdv 		return;
2593481ecdfSdv 
2603481ecdfSdv 	vq_info = &dev->vq[dev->cfg.queue_select];
2613481ecdfSdv 	vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE;
2623481ecdfSdv 
2633481ecdfSdv 	hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIOBLK_QUEUE_SIZE));
2643481ecdfSdv 	if (hva == NULL)
2653481ecdfSdv 		fatal("vioblk_update_qa");
2663481ecdfSdv 	vq_info->q_hva = hva;
2673481ecdfSdv }
2683481ecdfSdv 
26920e554f8Sdv static void
2703481ecdfSdv vioblk_update_qs(struct vioblk_dev *dev)
2713481ecdfSdv {
2723481ecdfSdv 	struct virtio_vq_info *vq_info;
2733481ecdfSdv 
2743481ecdfSdv 	/* Invalid queue? */
2753481ecdfSdv 	if (dev->cfg.queue_select > 0) {
2763481ecdfSdv 		dev->cfg.queue_size = 0;
2773481ecdfSdv 		return;
2783481ecdfSdv 	}
2793481ecdfSdv 
2803481ecdfSdv 	vq_info = &dev->vq[dev->cfg.queue_select];
2813481ecdfSdv 
2823481ecdfSdv 	/* Update queue pfn/size based on queue select */
2833481ecdfSdv 	dev->cfg.queue_pfn = vq_info->q_gpa >> 12;
2843481ecdfSdv 	dev->cfg.queue_size = vq_info->qs;
2853481ecdfSdv }
2863481ecdfSdv 
2873481ecdfSdv /*
28820e554f8Sdv  * Process virtqueue notifications. If an unrecoverable error occurs, puts
28920e554f8Sdv  * device into a "needs reset" state.
29020e554f8Sdv  *
29120e554f8Sdv  * Returns 1 if an we need to assert an IRQ.
2923481ecdfSdv  */
29320e554f8Sdv static int
2943481ecdfSdv vioblk_notifyq(struct vioblk_dev *dev)
2953481ecdfSdv {
29620e554f8Sdv 	uint32_t cmd_len;
29720e554f8Sdv 	uint16_t idx, cmd_desc_idx;
2983481ecdfSdv 	uint8_t ds;
29920e554f8Sdv 	off_t offset;
30020e554f8Sdv 	ssize_t sz;
301f0a11786Sdv 	int is_write, notify = 0, i;
3023481ecdfSdv 	char *vr;
30320e554f8Sdv 	struct vring_desc *table, *desc;
3043481ecdfSdv 	struct vring_avail *avail;
3053481ecdfSdv 	struct vring_used *used;
30620e554f8Sdv 	struct virtio_blk_req_hdr *cmd;
3073481ecdfSdv 	struct virtio_vq_info *vq_info;
3083481ecdfSdv 
3093481ecdfSdv 	/* Invalid queue? */
3103481ecdfSdv 	if (dev->cfg.queue_notify > 0)
3113481ecdfSdv 		return (0);
3123481ecdfSdv 
3133481ecdfSdv 	vq_info = &dev->vq[dev->cfg.queue_notify];
31420e554f8Sdv 	idx = vq_info->last_avail;
3153481ecdfSdv 	vr = vq_info->q_hva;
3163481ecdfSdv 	if (vr == NULL)
3173481ecdfSdv 		fatalx("%s: null vring", __func__);
3183481ecdfSdv 
31920e554f8Sdv 	/* Compute offsets in table of descriptors, avail ring, and used ring */
32020e554f8Sdv 	table = (struct vring_desc *)(vr);
3213481ecdfSdv 	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
3223481ecdfSdv 	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
3233481ecdfSdv 
32420e554f8Sdv 	while (idx != avail->idx) {
32520e554f8Sdv 		/* Retrieve Command descriptor. */
32620e554f8Sdv 		cmd_desc_idx = avail->ring[idx & VIOBLK_QUEUE_MASK];
32720e554f8Sdv 		desc = &table[cmd_desc_idx];
32820e554f8Sdv 		cmd_len = desc->len;
3293481ecdfSdv 
33020e554f8Sdv 		/*
33120e554f8Sdv 		 * Validate Command descriptor. It should be chained to another
33220e554f8Sdv 		 * descriptor and not be itself writable.
33320e554f8Sdv 		 */
33420e554f8Sdv 		if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
33520e554f8Sdv 			log_warnx("%s: unchained cmd descriptor", __func__);
33620e554f8Sdv 			goto reset;
33720e554f8Sdv 		}
33820e554f8Sdv 		if (DESC_WRITABLE(desc)) {
33920e554f8Sdv 			log_warnx("%s: invalid cmd descriptor state", __func__);
34020e554f8Sdv 			goto reset;
3413481ecdfSdv 		}
3423481ecdfSdv 
34320e554f8Sdv 		/* Retrieve the vioblk command request. */
34420e554f8Sdv 		cmd = hvaddr_mem(desc->addr, sizeof(*cmd));
34520e554f8Sdv 		if (cmd == NULL)
34620e554f8Sdv 			goto reset;
3473481ecdfSdv 
34820e554f8Sdv 		/* Advance to the 2nd descriptor. */
34920e554f8Sdv 		desc = &table[desc->next & VIOBLK_QUEUE_MASK];
3503481ecdfSdv 
35120e554f8Sdv 		/* Process each available command & chain. */
35220e554f8Sdv 		switch (cmd->type) {
3533481ecdfSdv 		case VIRTIO_BLK_T_IN:
3543481ecdfSdv 		case VIRTIO_BLK_T_OUT:
35520e554f8Sdv 			/* Read (IN) & Write (OUT) */
35620e554f8Sdv 			is_write = (cmd->type == VIRTIO_BLK_T_OUT) ? 1 : 0;
35720e554f8Sdv 			offset = cmd->sector * VIRTIO_BLK_SECTOR_SIZE;
35820e554f8Sdv 			sz = vioblk_rw(dev, is_write, offset, table, &desc);
35920e554f8Sdv 			if (sz == -1)
36020e554f8Sdv 				ds = VIRTIO_BLK_S_IOERR;
36120e554f8Sdv 			else
3623481ecdfSdv 				ds = VIRTIO_BLK_S_OK;
3633481ecdfSdv 			break;
3643481ecdfSdv 		case VIRTIO_BLK_T_GET_ID:
3653481ecdfSdv 			/*
3663481ecdfSdv 			 * We don't support this command yet. While it's not
3673481ecdfSdv 			 * officially part of the virtio spec (will be in v1.2)
3683481ecdfSdv 			 * there's no feature to negotiate. Linux drivers will
3693481ecdfSdv 			 * often send this command regardless.
3703481ecdfSdv 			 */
3713481ecdfSdv 			ds = VIRTIO_BLK_S_UNSUPP;
372*8fbe85f5Skirill 			break;
3733481ecdfSdv 		default:
37420e554f8Sdv 			log_warnx("%s: unsupported vioblk command %d", __func__,
37520e554f8Sdv 			    cmd->type);
3763481ecdfSdv 			ds = VIRTIO_BLK_S_UNSUPP;
3773481ecdfSdv 			break;
3783481ecdfSdv 		}
3793481ecdfSdv 
38020e554f8Sdv 		/* Advance to the end of the chain, if needed. */
38120e554f8Sdv 		i = 0;
38220e554f8Sdv 		while (desc->flags & VRING_DESC_F_NEXT) {
38320e554f8Sdv 			desc = &table[desc->next & VIOBLK_QUEUE_MASK];
38420e554f8Sdv 			if (++i >= VIOBLK_QUEUE_SIZE) {
38520e554f8Sdv 				/*
38620e554f8Sdv 				 * If we encounter an infinite/looping chain,
38720e554f8Sdv 				 * not much we can do but say we need a reset.
38820e554f8Sdv 				 */
38920e554f8Sdv 				log_warnx("%s: descriptor chain overflow",
39020e554f8Sdv 				    __func__);
39120e554f8Sdv 				goto reset;
3923481ecdfSdv 			}
3933481ecdfSdv 		}
3943481ecdfSdv 
39520e554f8Sdv 		/* Provide the status of our command processing. */
39620e554f8Sdv 		if (!DESC_WRITABLE(desc)) {
39720e554f8Sdv 			log_warnx("%s: status descriptor unwritable", __func__);
39820e554f8Sdv 			goto reset;
39920e554f8Sdv 		}
40020e554f8Sdv 		/* Overkill as ds is 1 byte, but validates gpa. */
40120e554f8Sdv 		if (write_mem(desc->addr, &ds, sizeof(ds)))
40220e554f8Sdv 			log_warnx("%s: can't write device status data "
40320e554f8Sdv 			    "@ 0x%llx",__func__, desc->addr);
40420e554f8Sdv 
40520e554f8Sdv 		dev->cfg.isr_status |= 1;
40620e554f8Sdv 		notify = 1;
40720e554f8Sdv 
4083481ecdfSdv 		used->ring[used->idx & VIOBLK_QUEUE_MASK].id = cmd_desc_idx;
40920e554f8Sdv 		used->ring[used->idx & VIOBLK_QUEUE_MASK].len = cmd_len;
41020e554f8Sdv 
4113481ecdfSdv 		__sync_synchronize();
4123481ecdfSdv 		used->idx++;
41320e554f8Sdv 		idx++;
4143481ecdfSdv 	}
41520e554f8Sdv 
41620e554f8Sdv 	vq_info->last_avail = idx;
41720e554f8Sdv 	return (notify);
41820e554f8Sdv 
41920e554f8Sdv reset:
42020e554f8Sdv 	/*
42120e554f8Sdv 	 * When setting the "needs reset" flag, the driver is notified
42220e554f8Sdv 	 * via a configuration change interrupt.
42320e554f8Sdv 	 */
42420e554f8Sdv 	dev->cfg.device_status |= DEVICE_NEEDS_RESET;
42520e554f8Sdv 	dev->cfg.isr_status |= VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
4263481ecdfSdv 	return (1);
4273481ecdfSdv }
4283481ecdfSdv 
4293481ecdfSdv static void
4303481ecdfSdv dev_dispatch_vm(int fd, short event, void *arg)
4313481ecdfSdv {
4323481ecdfSdv 	struct virtio_dev	*dev = (struct virtio_dev *)arg;
4333481ecdfSdv 	struct imsgev		*iev = &dev->async_iev;
4343481ecdfSdv 	struct imsgbuf		*ibuf = &iev->ibuf;
4353481ecdfSdv 	struct imsg	 	 imsg;
4363481ecdfSdv 	ssize_t			 n = 0;
43708d0da61Sdv 	int			 verbose;
4383481ecdfSdv 
4393481ecdfSdv 	if (event & EV_READ) {
440d12ef5f3Sclaudio 		if ((n = imsgbuf_read(ibuf)) == -1)
441dd7efffeSclaudio 			fatal("%s: imsgbuf_read", __func__);
4423481ecdfSdv 		if (n == 0) {
4433481ecdfSdv 			/* this pipe is dead, so remove the event handler */
4443481ecdfSdv 			log_debug("%s: pipe dead (EV_READ)", __func__);
4453481ecdfSdv 			event_del(&iev->ev);
4463481ecdfSdv 			event_loopexit(NULL);
4473481ecdfSdv 			return;
4483481ecdfSdv 		}
4493481ecdfSdv 	}
4503481ecdfSdv 
4513481ecdfSdv 	if (event & EV_WRITE) {
452dd7efffeSclaudio 		if (imsgbuf_write(ibuf) == -1) {
453c1aa9554Sclaudio 			if (errno == EPIPE) {
454c1aa9554Sclaudio 				/* this pipe is dead, remove the handler */
4553481ecdfSdv 				log_debug("%s: pipe dead (EV_WRITE)", __func__);
4563481ecdfSdv 				event_del(&iev->ev);
457c1aa9554Sclaudio 				event_loopexit(NULL);
4583481ecdfSdv 				return;
4593481ecdfSdv 			}
460dd7efffeSclaudio 			fatal("%s: imsgbuf_write", __func__);
461c1aa9554Sclaudio 		}
4623481ecdfSdv 	}
4633481ecdfSdv 
4643481ecdfSdv 	for (;;) {
4653481ecdfSdv 		if ((n = imsg_get(ibuf, &imsg)) == -1)
4663481ecdfSdv 			fatal("%s: imsg_get", __func__);
4673481ecdfSdv 		if (n == 0)
4683481ecdfSdv 			break;
4693481ecdfSdv 
4703481ecdfSdv 		switch (imsg.hdr.type) {
4713481ecdfSdv 		case IMSG_VMDOP_PAUSE_VM:
4723481ecdfSdv 			log_debug("%s: pausing", __func__);
4733481ecdfSdv 			break;
4743481ecdfSdv 		case IMSG_VMDOP_UNPAUSE_VM:
4753481ecdfSdv 			log_debug("%s: unpausing", __func__);
4763481ecdfSdv 			break;
47708d0da61Sdv 		case IMSG_CTL_VERBOSE:
47808d0da61Sdv 			IMSG_SIZE_CHECK(&imsg, &verbose);
47908d0da61Sdv 			memcpy(&verbose, imsg.data, sizeof(verbose));
48008d0da61Sdv 			log_setverbose(verbose);
48108d0da61Sdv 			break;
4823481ecdfSdv 		default:
4833481ecdfSdv 			log_warnx("%s: unhandled imsg type %d", __func__,
4843481ecdfSdv 			    imsg.hdr.type);
4853481ecdfSdv 			break;
4863481ecdfSdv 		}
4873481ecdfSdv 		imsg_free(&imsg);
4883481ecdfSdv 	}
4893481ecdfSdv 	imsg_event_add(iev);
4903481ecdfSdv }
4913481ecdfSdv 
4923481ecdfSdv /*
4933481ecdfSdv  * Synchronous IO handler.
4943481ecdfSdv  *
4953481ecdfSdv  */
4963481ecdfSdv static void
4973481ecdfSdv handle_sync_io(int fd, short event, void *arg)
4983481ecdfSdv {
4993481ecdfSdv 	struct virtio_dev *dev = (struct virtio_dev *)arg;
5003481ecdfSdv 	struct imsgev *iev = &dev->sync_iev;
5013481ecdfSdv 	struct imsgbuf *ibuf = &iev->ibuf;
5023481ecdfSdv 	struct viodev_msg msg;
5033481ecdfSdv 	struct imsg imsg;
5043481ecdfSdv 	ssize_t n;
5058761e6b4Sdv 	int8_t intr = INTR_STATE_NOOP;
5063481ecdfSdv 
5073481ecdfSdv 	if (event & EV_READ) {
508d12ef5f3Sclaudio 		if ((n = imsgbuf_read(ibuf)) == -1)
509dd7efffeSclaudio 			fatal("%s: imsgbuf_read", __func__);
5103481ecdfSdv 		if (n == 0) {
5113481ecdfSdv 			/* this pipe is dead, so remove the event handler */
5123481ecdfSdv 			log_debug("%s: vioblk pipe dead (EV_READ)", __func__);
5133481ecdfSdv 			event_del(&iev->ev);
5143481ecdfSdv 			event_loopexit(NULL);
5153481ecdfSdv 			return;
5163481ecdfSdv 		}
5173481ecdfSdv 	}
5183481ecdfSdv 
5193481ecdfSdv 	if (event & EV_WRITE) {
520dd7efffeSclaudio 		if (imsgbuf_write(ibuf) == -1) {
521c1aa9554Sclaudio 			if (errno == EPIPE) {
522c1aa9554Sclaudio 				/* this pipe is dead, remove the handler */
523c1aa9554Sclaudio 				log_debug("%s: pipe dead (EV_WRITE)", __func__);
5243481ecdfSdv 				event_del(&iev->ev);
5253481ecdfSdv 				event_loopexit(NULL);
5263481ecdfSdv 				return;
5273481ecdfSdv 			}
528dd7efffeSclaudio 			fatal("%s: imsgbuf_write", __func__);
529c1aa9554Sclaudio 		}
5303481ecdfSdv 	}
5313481ecdfSdv 
5323481ecdfSdv 	for (;;) {
5333481ecdfSdv 		if ((n = imsg_get(ibuf, &imsg)) == -1)
5343481ecdfSdv 			fatalx("%s: imsg_get (n=%ld)", __func__, n);
5353481ecdfSdv 		if (n == 0)
5363481ecdfSdv 			break;
5373481ecdfSdv 
5383481ecdfSdv 		/* Unpack our message. They ALL should be dev messeges! */
5393481ecdfSdv 		IMSG_SIZE_CHECK(&imsg, &msg);
5403481ecdfSdv 		memcpy(&msg, imsg.data, sizeof(msg));
5413481ecdfSdv 		imsg_free(&imsg);
5423481ecdfSdv 
5433481ecdfSdv 		switch (msg.type) {
5443481ecdfSdv 		case VIODEV_MSG_DUMP:
5453481ecdfSdv 			/* Dump device */
5463481ecdfSdv 			n = atomicio(vwrite, dev->sync_fd, dev, sizeof(*dev));
5473481ecdfSdv 			if (n != sizeof(*dev)) {
5483481ecdfSdv 				log_warnx("%s: failed to dump vioblk device",
5493481ecdfSdv 				    __func__);
5503481ecdfSdv 				break;
5513481ecdfSdv 			}
5523481ecdfSdv 		case VIODEV_MSG_IO_READ:
5533481ecdfSdv 			/* Read IO: make sure to send a reply */
5548761e6b4Sdv 			msg.data = handle_io_read(&msg, dev, &intr);
5553481ecdfSdv 			msg.data_valid = 1;
5568761e6b4Sdv 			msg.state = intr;
5573481ecdfSdv 			imsg_compose_event(iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
5583481ecdfSdv 			    sizeof(msg));
5593481ecdfSdv 			break;
5603481ecdfSdv 		case VIODEV_MSG_IO_WRITE:
5613481ecdfSdv 			/* Write IO: no reply needed */
5623481ecdfSdv 			if (handle_io_write(&msg, dev) == 1)
563c4fd4c5bSdv 				virtio_assert_irq(dev, 0);
5643481ecdfSdv 			break;
5653481ecdfSdv 		case VIODEV_MSG_SHUTDOWN:
5663481ecdfSdv 			event_del(&dev->sync_iev.ev);
5673481ecdfSdv 			event_loopbreak();
5683481ecdfSdv 			return;
5693481ecdfSdv 		default:
5703481ecdfSdv 			fatalx("%s: invalid msg type %d", __func__, msg.type);
5713481ecdfSdv 		}
5723481ecdfSdv 	}
5733481ecdfSdv 	imsg_event_add(iev);
5743481ecdfSdv }
5753481ecdfSdv 
5763481ecdfSdv static int
5773481ecdfSdv handle_io_write(struct viodev_msg *msg, struct virtio_dev *dev)
5783481ecdfSdv {
5793481ecdfSdv 	struct vioblk_dev *vioblk = &dev->vioblk;
5803481ecdfSdv 	uint32_t data = msg->data;
5813481ecdfSdv 	int intr = 0;
5823481ecdfSdv 
5833481ecdfSdv 	switch (msg->reg) {
5843481ecdfSdv 	case VIRTIO_CONFIG_DEVICE_FEATURES:
5853481ecdfSdv 	case VIRTIO_CONFIG_QUEUE_SIZE:
5863481ecdfSdv 	case VIRTIO_CONFIG_ISR_STATUS:
5873481ecdfSdv 		log_warnx("%s: illegal write %x to %s", __progname, data,
5883481ecdfSdv 		    virtio_reg_name(msg->reg));
5893481ecdfSdv 		break;
5903481ecdfSdv 	case VIRTIO_CONFIG_GUEST_FEATURES:
5913481ecdfSdv 		vioblk->cfg.guest_feature = data;
5923481ecdfSdv 		break;
5933481ecdfSdv 	case VIRTIO_CONFIG_QUEUE_PFN:
5943481ecdfSdv 		vioblk->cfg.queue_pfn = data;
5953481ecdfSdv 		vioblk_update_qa(vioblk);
5963481ecdfSdv 		break;
5973481ecdfSdv 	case VIRTIO_CONFIG_QUEUE_SELECT:
5983481ecdfSdv 		vioblk->cfg.queue_select = data;
5993481ecdfSdv 		vioblk_update_qs(vioblk);
6003481ecdfSdv 		break;
6013481ecdfSdv 	case VIRTIO_CONFIG_QUEUE_NOTIFY:
60220e554f8Sdv 		/* XXX We should be stricter about status checks. */
60320e554f8Sdv 		if (!(vioblk->cfg.device_status & DEVICE_NEEDS_RESET)) {
6043481ecdfSdv 			vioblk->cfg.queue_notify = data;
6053481ecdfSdv 			if (vioblk_notifyq(vioblk))
6063481ecdfSdv 				intr = 1;
60720e554f8Sdv 		}
6083481ecdfSdv 		break;
6093481ecdfSdv 	case VIRTIO_CONFIG_DEVICE_STATUS:
6103481ecdfSdv 		vioblk->cfg.device_status = data;
6113481ecdfSdv 		if (vioblk->cfg.device_status == 0) {
6123481ecdfSdv 			vioblk->cfg.guest_feature = 0;
6133481ecdfSdv 			vioblk->cfg.queue_pfn = 0;
6143481ecdfSdv 			vioblk_update_qa(vioblk);
6153481ecdfSdv 			vioblk->cfg.queue_size = 0;
6163481ecdfSdv 			vioblk_update_qs(vioblk);
6173481ecdfSdv 			vioblk->cfg.queue_select = 0;
6183481ecdfSdv 			vioblk->cfg.queue_notify = 0;
6193481ecdfSdv 			vioblk->cfg.isr_status = 0;
6203481ecdfSdv 			vioblk->vq[0].last_avail = 0;
6213481ecdfSdv 			vioblk->vq[0].notified_avail = 0;
622c4fd4c5bSdv 			virtio_deassert_irq(dev, msg->vcpu);
6233481ecdfSdv 		}
6243481ecdfSdv 		break;
6253481ecdfSdv 	default:
6263481ecdfSdv 		break;
6273481ecdfSdv 	}
6283481ecdfSdv 	return (intr);
6293481ecdfSdv }
6303481ecdfSdv 
6313481ecdfSdv static uint32_t
6328761e6b4Sdv handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev, int8_t *intr)
6333481ecdfSdv {
6343481ecdfSdv 	struct vioblk_dev *vioblk = &dev->vioblk;
6353481ecdfSdv 	uint8_t sz = msg->io_sz;
6363481ecdfSdv 	uint32_t data;
6373481ecdfSdv 
6383481ecdfSdv 	if (msg->data_valid)
6393481ecdfSdv 		data = msg->data;
6403481ecdfSdv 	else
6413481ecdfSdv 		data = 0;
6423481ecdfSdv 
6433481ecdfSdv 	switch (msg->reg) {
6443481ecdfSdv 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
6453481ecdfSdv 		switch (sz) {
6463481ecdfSdv 		case 4:
64720e554f8Sdv 			data = (uint32_t)(vioblk->capacity);
6483481ecdfSdv 			break;
6493481ecdfSdv 		case 2:
6503481ecdfSdv 			data &= 0xFFFF0000;
65120e554f8Sdv 			data |= (uint32_t)(vioblk->capacity) & 0xFFFF;
6523481ecdfSdv 			break;
6533481ecdfSdv 		case 1:
6543481ecdfSdv 			data &= 0xFFFFFF00;
65520e554f8Sdv 			data |= (uint32_t)(vioblk->capacity) & 0xFF;
6563481ecdfSdv 			break;
6573481ecdfSdv 		}
6583481ecdfSdv 		/* XXX handle invalid sz */
6593481ecdfSdv 		break;
6603481ecdfSdv 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
6613481ecdfSdv 		if (sz == 1) {
6623481ecdfSdv 			data &= 0xFFFFFF00;
66320e554f8Sdv 			data |= (uint32_t)(vioblk->capacity >> 8) & 0xFF;
6643481ecdfSdv 		}
6653481ecdfSdv 		/* XXX handle invalid sz */
6663481ecdfSdv 		break;
6673481ecdfSdv 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
6683481ecdfSdv 		if (sz == 1) {
6693481ecdfSdv 			data &= 0xFFFFFF00;
67020e554f8Sdv 			data |= (uint32_t)(vioblk->capacity >> 16) & 0xFF;
6713481ecdfSdv 		} else if (sz == 2) {
6723481ecdfSdv 			data &= 0xFFFF0000;
67320e554f8Sdv 			data |= (uint32_t)(vioblk->capacity >> 16) & 0xFFFF;
6743481ecdfSdv 		}
6753481ecdfSdv 		/* XXX handle invalid sz */
6763481ecdfSdv 		break;
6773481ecdfSdv 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
6783481ecdfSdv 		if (sz == 1) {
6793481ecdfSdv 			data &= 0xFFFFFF00;
68020e554f8Sdv 			data |= (uint32_t)(vioblk->capacity >> 24) & 0xFF;
6813481ecdfSdv 		}
6823481ecdfSdv 		/* XXX handle invalid sz */
6833481ecdfSdv 		break;
6843481ecdfSdv 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
6853481ecdfSdv 		switch (sz) {
6863481ecdfSdv 		case 4:
68720e554f8Sdv 			data = (uint32_t)(vioblk->capacity >> 32);
6883481ecdfSdv 			break;
6893481ecdfSdv 		case 2:
6903481ecdfSdv 			data &= 0xFFFF0000;
69120e554f8Sdv 			data |= (uint32_t)(vioblk->capacity >> 32) & 0xFFFF;
6923481ecdfSdv 			break;
6933481ecdfSdv 		case 1:
6943481ecdfSdv 			data &= 0xFFFFFF00;
69520e554f8Sdv 			data |= (uint32_t)(vioblk->capacity >> 32) & 0xFF;
6963481ecdfSdv 			break;
6973481ecdfSdv 		}
6983481ecdfSdv 		/* XXX handle invalid sz */
6993481ecdfSdv 		break;
7003481ecdfSdv 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
7013481ecdfSdv 		if (sz == 1) {
7023481ecdfSdv 			data &= 0xFFFFFF00;
70320e554f8Sdv 			data |= (uint32_t)(vioblk->capacity >> 40) & 0xFF;
7043481ecdfSdv 		}
7053481ecdfSdv 		/* XXX handle invalid sz */
7063481ecdfSdv 		break;
7073481ecdfSdv 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 6:
7083481ecdfSdv 		if (sz == 1) {
7093481ecdfSdv 			data &= 0xFFFFFF00;
71020e554f8Sdv 			data |= (uint32_t)(vioblk->capacity >> 48) & 0xFF;
7113481ecdfSdv 		} else if (sz == 2) {
7123481ecdfSdv 			data &= 0xFFFF0000;
71320e554f8Sdv 			data |= (uint32_t)(vioblk->capacity >> 48) & 0xFFFF;
7143481ecdfSdv 		}
7153481ecdfSdv 		/* XXX handle invalid sz */
7163481ecdfSdv 		break;
7173481ecdfSdv 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 7:
7183481ecdfSdv 		if (sz == 1) {
7193481ecdfSdv 			data &= 0xFFFFFF00;
72020e554f8Sdv 			data |= (uint32_t)(vioblk->capacity >> 56) & 0xFF;
7213481ecdfSdv 		}
7223481ecdfSdv 		/* XXX handle invalid sz */
7233481ecdfSdv 		break;
72420e554f8Sdv 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
7253481ecdfSdv 		switch (sz) {
7263481ecdfSdv 		case 4:
72720e554f8Sdv 			data = (uint32_t)(vioblk->seg_max);
7283481ecdfSdv 			break;
7293481ecdfSdv 		case 2:
7303481ecdfSdv 			data &= 0xFFFF0000;
73120e554f8Sdv 			data |= (uint32_t)(vioblk->seg_max) & 0xFFFF;
7323481ecdfSdv 			break;
7333481ecdfSdv 		case 1:
7343481ecdfSdv 			data &= 0xFFFFFF00;
73520e554f8Sdv 			data |= (uint32_t)(vioblk->seg_max) & 0xFF;
7363481ecdfSdv 			break;
7373481ecdfSdv 		}
7383481ecdfSdv 		/* XXX handle invalid sz */
7393481ecdfSdv 		break;
74020e554f8Sdv 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 13:
7413481ecdfSdv 		if (sz == 1) {
7423481ecdfSdv 			data &= 0xFFFFFF00;
74320e554f8Sdv 			data |= (uint32_t)(vioblk->seg_max >> 8) & 0xFF;
7443481ecdfSdv 		}
7453481ecdfSdv 		/* XXX handle invalid sz */
7463481ecdfSdv 		break;
74720e554f8Sdv 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 14:
7483481ecdfSdv 		if (sz == 1) {
7493481ecdfSdv 			data &= 0xFFFFFF00;
75020e554f8Sdv 			data |= (uint32_t)(vioblk->seg_max >> 16) & 0xFF;
7513481ecdfSdv 		} else if (sz == 2) {
7523481ecdfSdv 			data &= 0xFFFF0000;
75320e554f8Sdv 			data |= (uint32_t)(vioblk->seg_max >> 16)
7543481ecdfSdv 			    & 0xFFFF;
7553481ecdfSdv 		}
7563481ecdfSdv 		/* XXX handle invalid sz */
7573481ecdfSdv 		break;
75820e554f8Sdv 	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 15:
7593481ecdfSdv 		if (sz == 1) {
7603481ecdfSdv 			data &= 0xFFFFFF00;
76120e554f8Sdv 			data |= (uint32_t)(vioblk->seg_max >> 24) & 0xFF;
7623481ecdfSdv 		}
7633481ecdfSdv 		/* XXX handle invalid sz */
7643481ecdfSdv 		break;
7653481ecdfSdv 	case VIRTIO_CONFIG_DEVICE_FEATURES:
7663481ecdfSdv 		data = vioblk->cfg.device_feature;
7673481ecdfSdv 		break;
7683481ecdfSdv 	case VIRTIO_CONFIG_GUEST_FEATURES:
7693481ecdfSdv 		data = vioblk->cfg.guest_feature;
7703481ecdfSdv 		break;
7713481ecdfSdv 	case VIRTIO_CONFIG_QUEUE_PFN:
7723481ecdfSdv 		data = vioblk->cfg.queue_pfn;
7733481ecdfSdv 		break;
7743481ecdfSdv 	case VIRTIO_CONFIG_QUEUE_SIZE:
7753481ecdfSdv 		data = vioblk->cfg.queue_size;
7763481ecdfSdv 		break;
7773481ecdfSdv 	case VIRTIO_CONFIG_QUEUE_SELECT:
7783481ecdfSdv 		data = vioblk->cfg.queue_select;
7793481ecdfSdv 		break;
7803481ecdfSdv 	case VIRTIO_CONFIG_QUEUE_NOTIFY:
7813481ecdfSdv 		data = vioblk->cfg.queue_notify;
7823481ecdfSdv 		break;
7833481ecdfSdv 	case VIRTIO_CONFIG_DEVICE_STATUS:
7843481ecdfSdv 		data = vioblk->cfg.device_status;
7853481ecdfSdv 		break;
7863481ecdfSdv 	case VIRTIO_CONFIG_ISR_STATUS:
7873481ecdfSdv 		data = vioblk->cfg.isr_status;
7883481ecdfSdv 		vioblk->cfg.isr_status = 0;
7898761e6b4Sdv 		if (intr != NULL)
7908761e6b4Sdv 			*intr = INTR_STATE_DEASSERT;
7913481ecdfSdv 		break;
7923481ecdfSdv 	default:
7933481ecdfSdv 		return (0xFFFFFFFF);
7943481ecdfSdv 	}
7953481ecdfSdv 
7963481ecdfSdv 	return (data);
7973481ecdfSdv }
79820e554f8Sdv 
79920e554f8Sdv /*
80020e554f8Sdv  * Emulate read/write io. Walks the descriptor chain, collecting io work and
80120e554f8Sdv  * then emulates the read or write.
80220e554f8Sdv  *
80320e554f8Sdv  * On success, returns bytes read/written.
80420e554f8Sdv  * On error, returns -1 and descriptor (desc) remains at its current position.
80520e554f8Sdv  */
80620e554f8Sdv static ssize_t
80720e554f8Sdv vioblk_rw(struct vioblk_dev *dev, int is_write, off_t offset,
80820e554f8Sdv     struct vring_desc *desc_tbl, struct vring_desc **desc)
80920e554f8Sdv {
81020e554f8Sdv 	struct iovec *iov = NULL;
81120e554f8Sdv 	ssize_t sz = 0;
81220e554f8Sdv 	size_t io_idx = 0;		/* Index into iovec workqueue. */
81320e554f8Sdv 	size_t xfer_sz = 0;		/* Total accumulated io bytes. */
81420e554f8Sdv 
81520e554f8Sdv 	do {
81620e554f8Sdv 		iov = &io_v[io_idx];
81720e554f8Sdv 
81820e554f8Sdv 		/*
81920e554f8Sdv 		 * Reads require writable descriptors. Writes require
82020e554f8Sdv 		 * non-writeable descriptors.
82120e554f8Sdv 		 */
82220e554f8Sdv 		if ((!is_write) ^ DESC_WRITABLE(*desc)) {
82320e554f8Sdv 			log_warnx("%s: invalid descriptor for %s command",
82420e554f8Sdv 			    __func__, is_write ? "write" : "read");
82520e554f8Sdv 			return (-1);
82620e554f8Sdv 		}
82720e554f8Sdv 
82820e554f8Sdv 		/* Collect the IO segment information. */
82920e554f8Sdv 		iov->iov_len = (size_t)(*desc)->len;
83020e554f8Sdv 		iov->iov_base = hvaddr_mem((*desc)->addr, iov->iov_len);
83120e554f8Sdv 		if (iov->iov_base == NULL)
83220e554f8Sdv 			return (-1);
83320e554f8Sdv 
83420e554f8Sdv 		/* Move our counters. */
83520e554f8Sdv 		xfer_sz += iov->iov_len;
83620e554f8Sdv 		io_idx++;
83720e554f8Sdv 
83820e554f8Sdv 		/* Guard against infinite chains */
83920e554f8Sdv 		if (io_idx >= nitems(io_v)) {
84020e554f8Sdv 			log_warnx("%s: descriptor table "
84120e554f8Sdv 			    "invalid", __func__);
84220e554f8Sdv 			return (-1);
84320e554f8Sdv 		}
84420e554f8Sdv 
84520e554f8Sdv 		/* Advance to the next descriptor. */
84620e554f8Sdv 		*desc = &desc_tbl[(*desc)->next & VIOBLK_QUEUE_MASK];
84720e554f8Sdv 	} while ((*desc)->flags & VRING_DESC_F_NEXT);
84820e554f8Sdv 
84920e554f8Sdv 	/*
85020e554f8Sdv 	 * Validate the requested block io operation alignment and size.
85120e554f8Sdv 	 * Checking offset is just an extra caution as it is derived from
85220e554f8Sdv 	 * a disk sector and is done for completeness in bounds checking.
85320e554f8Sdv 	 */
85420e554f8Sdv 	if (offset % VIRTIO_BLK_SECTOR_SIZE != 0 &&
85520e554f8Sdv 	    xfer_sz % VIRTIO_BLK_SECTOR_SIZE != 0) {
85620e554f8Sdv 		log_warnx("%s: unaligned read", __func__);
85720e554f8Sdv 		return (-1);
85820e554f8Sdv 	}
85920e554f8Sdv 	if (xfer_sz > SSIZE_MAX) {	/* iovec_copyin limit */
86020e554f8Sdv 		log_warnx("%s: invalid %s size: %zu", __func__,
86120e554f8Sdv 		    is_write ? "write" : "read", xfer_sz);
86220e554f8Sdv 		return (-1);
86320e554f8Sdv 	}
86420e554f8Sdv 
86520e554f8Sdv 	/* Emulate the Read or Write operation. */
86620e554f8Sdv 	if (is_write)
86720e554f8Sdv 		sz = dev->file.pwritev(dev->file.p, io_v, io_idx, offset);
86820e554f8Sdv 	else
86920e554f8Sdv 		sz = dev->file.preadv(dev->file.p, io_v, io_idx, offset);
87020e554f8Sdv 	if (sz != (ssize_t)xfer_sz) {
87120e554f8Sdv 		log_warnx("%s: %s failure at offset 0x%llx, xfer_sz=%zu, "
87220e554f8Sdv 		    "sz=%ld", __func__, (is_write ? "write" : "read"), offset,
87320e554f8Sdv 		    xfer_sz, sz);
87420e554f8Sdv 		return (-1);
87520e554f8Sdv 	}
87620e554f8Sdv 
87720e554f8Sdv 	return (sz);
87820e554f8Sdv }
879