/*	$OpenBSD: vionet.c,v 1.22 2024/11/21 13:39:34 claudio Exp $	*/

/*
 * Copyright (c) 2023 Dave Voutila <dv@openbsd.org>
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <sys/types.h>

#include <dev/pci/virtio_pcireg.h>
#include <dev/pv/virtioreg.h>

#include <net/if.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <pthread.h>
#include <pthread_np.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "atomicio.h"
#include "virtio.h"
#include "vmd.h"

#define VIRTIO_NET_F_MAC	(1 << 5)
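
/*
 * Legacy virtio-net exposes two virtqueues: 0 receives into the guest and
 * 1 transmits from it. The control queue (id 2) is not implemented (see
 * vionet_notifyq()).
 */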
#define RXQ	0
#define TXQ	1

extern char *__progname;
extern struct vmd_vm *current_vm;

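/*
 * Packets are injected into the rx path as a (buf, len) pair written over
 * pipe_inject. The buffer is heap-allocated by the sender and freed by the
 * receiver after copying (see vionet_rx_copy()).
 */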
struct packet {
	uint8_t	*buf;
	size_t	 len;
};

static void *rx_run_loop(void *);
static void *tx_run_loop(void *);
static int vionet_rx(struct vionet_dev *, int);
static ssize_t vionet_rx_copy(struct vionet_dev *, int, const struct iovec *,
    int, size_t);
static ssize_t vionet_rx_zerocopy(struct vionet_dev *, int,
    const struct iovec *, int);
static void vionet_rx_event(int, short, void *);
static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *,
    int8_t *);
static void handle_io_write(struct viodev_msg *, struct virtio_dev *);
static int vionet_tx(struct virtio_dev *);
static void vionet_notifyq(struct virtio_dev *);
static void dev_dispatch_vm(int, short, void *);
static void handle_sync_io(int, short, void *);
static void read_pipe_main(int, short, void *);
static void read_pipe_rx(int, short, void *);
static void read_pipe_tx(int, short, void *);
static void vionet_assert_pic_irq(struct virtio_dev *);
static void vionet_deassert_pic_irq(struct virtio_dev *);

/* Device Globals */
struct event ev_tap;
struct event ev_inject;
struct event_base *ev_base_main;
struct event_base *ev_base_rx;
struct event_base *ev_base_tx;
pthread_t rx_thread;
pthread_t tx_thread;
struct vm_dev_pipe pipe_main;
struct vm_dev_pipe pipe_rx;
struct vm_dev_pipe pipe_tx;
int pipe_inject[2];
#define READ	0
#define WRITE	1
struct iovec iov_rx[VIONET_QUEUE_SIZE];
struct iovec iov_tx[VIONET_QUEUE_SIZE];
pthread_rwlock_t lock = NULL;		/* Guards device config state. */
int resetting = 0;	/* Transient state used to coordinate device reset. */
int rx_enabled = 0;	/* 1: we expect to read the tap, 0: wait for notify. */
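
/*
 * Threading model: the main thread owns the imsg channels to the vm
 * process, while dedicated rx and tx threads run their own event loops.
 * The threads exchange control messages over the vm_dev_pipe channels
 * above and serialize access to device config state with the rwlock.
 */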
934d307b04Sdv 
943481ecdfSdv __dead void
953c817da7Sdv vionet_main(int fd, int fd_vmm)
963481ecdfSdv {
973481ecdfSdv 	struct virtio_dev	 dev;
983481ecdfSdv 	struct vionet_dev	*vionet = NULL;
993481ecdfSdv 	struct viodev_msg 	 msg;
1003481ecdfSdv 	struct vmd_vm	 	 vm;
1013481ecdfSdv 	struct vm_create_params	*vcp;
1023481ecdfSdv 	ssize_t			 sz;
1033481ecdfSdv 	int			 ret;
1043481ecdfSdv 
	/*
	 * stdio - needed for read/write to the tap fd and channels to the vm.
	 * vmm + proc - needed to create shared vm mappings.
	 */
	if (pledge("stdio vmm proc", NULL) == -1)
		fatal("pledge");

	/* Initialize iovec arrays. */
	memset(iov_rx, 0, sizeof(iov_rx));
	memset(iov_tx, 0, sizeof(iov_tx));

	/* Receive our vionet_dev, mostly preconfigured. */
	sz = atomicio(read, fd, &dev, sizeof(dev));
	if (sz != sizeof(dev)) {
		ret = errno;
		log_warn("failed to receive vionet");
		goto fail;
	}
	if (dev.dev_type != VMD_DEVTYPE_NET) {
		ret = EINVAL;
		log_warn("received invalid device type");
		goto fail;
	}
	dev.sync_fd = fd;
	vionet = &dev.vionet;

	log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d"
	    ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd,
	    dev.async_fd, fd_vmm);

	/* Receive our vm information from the vm process. */
	memset(&vm, 0, sizeof(vm));
	sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm));
	if (sz != sizeof(vm)) {
		ret = EIO;
		log_warnx("failed to receive vm details");
		goto fail;
	}
	vcp = &vm.vm_params.vmc_params;
	current_vm = &vm;
	setproctitle("%s/vionet%d", vcp->vcp_name, vionet->idx);
	log_procinit("vm/%s/vionet%d", vcp->vcp_name, vionet->idx);

	/* Now that we have our vm information, we can remap memory. */
	ret = remap_guest_mem(&vm, fd_vmm);
	if (ret) {
		log_warnx("%s: failed to remap", __func__);
		goto fail;
	}

	/* We no longer need /dev/vmm access. */
	close_fd(fd_vmm);
	if (pledge("stdio", NULL) == -1)
		fatal("pledge2");

	/* If we're restoring hardware, re-initialize virtqueue hva's. */
	if (vm.vm_state & VM_STATE_RECEIVED) {
		struct virtio_vq_info *vq_info;
		void *hva = NULL;

		vq_info = &dev.vionet.vq[TXQ];
		if (vq_info->q_gpa != 0) {
			log_debug("%s: restoring TX virtqueue for gpa 0x%llx",
			    __func__, vq_info->q_gpa);
			hva = hvaddr_mem(vq_info->q_gpa,
			    vring_size(VIONET_QUEUE_SIZE));
			if (hva == NULL)
				fatalx("%s: hva == NULL", __func__);
			vq_info->q_hva = hva;
		}

		vq_info = &dev.vionet.vq[RXQ];
		if (vq_info->q_gpa != 0) {
			log_debug("%s: restoring RX virtqueue for gpa 0x%llx",
			    __func__, vq_info->q_gpa);
			hva = hvaddr_mem(vq_info->q_gpa,
			    vring_size(VIONET_QUEUE_SIZE));
			if (hva == NULL)
				fatalx("%s: hva == NULL", __func__);
			vq_info->q_hva = hva;
		}
	}

	/* Initialize our packet injection pipe. */
	if (pipe2(pipe_inject, O_NONBLOCK) == -1) {
		log_warn("%s: injection pipe", __func__);
		goto fail;
	}

	/* Initialize inter-thread communication channels. */
	vm_pipe_init2(&pipe_main, read_pipe_main, &dev);
	vm_pipe_init2(&pipe_rx, read_pipe_rx, &dev);
	vm_pipe_init2(&pipe_tx, read_pipe_tx, &dev);

	/* Initialize RX and TX threads. */
	ret = pthread_create(&rx_thread, NULL, rx_run_loop, &dev);
	if (ret) {
		errno = ret;
		log_warn("%s: failed to initialize rx thread", __func__);
		goto fail;
	}
	pthread_set_name_np(rx_thread, "rx");
	ret = pthread_create(&tx_thread, NULL, tx_run_loop, &dev);
	if (ret) {
		errno = ret;
		log_warn("%s: failed to initialize tx thread", __func__);
		goto fail;
	}
	pthread_set_name_np(tx_thread, "tx");

	/* Initialize our rwlock for guarding shared device state. */
	ret = pthread_rwlock_init(&lock, NULL);
	if (ret) {
		errno = ret;
		log_warn("%s: failed to initialize rwlock", __func__);
		goto fail;
	}

	/* Initialize libevent so we can start wiring event handlers. */
	ev_base_main = event_base_new();

	/* Add our handler for receiving messages from the RX/TX threads. */
	event_base_set(ev_base_main, &pipe_main.read_ev);
	event_add(&pipe_main.read_ev, NULL);

	/* Wire up an async imsg channel. */
	log_debug("%s: wiring in async vm event handler (fd=%d)", __func__,
	    dev.async_fd);
	if (vm_device_pipe(&dev, dev_dispatch_vm, ev_base_main)) {
		ret = EIO;
		log_warnx("vm_device_pipe");
		goto fail;
	}

	/* Configure our sync channel event handler. */
	log_debug("%s: wiring in sync channel handler (fd=%d)", __func__,
	    dev.sync_fd);
	if (imsgbuf_init(&dev.sync_iev.ibuf, dev.sync_fd) == -1) {
		log_warnx("imsgbuf_init");
		goto fail;
	}
	imsgbuf_allow_fdpass(&dev.sync_iev.ibuf);
	dev.sync_iev.handler = handle_sync_io;
	dev.sync_iev.data = &dev;
	dev.sync_iev.events = EV_READ;
	imsg_event_add2(&dev.sync_iev, ev_base_main);

	/* Send a ready message over the sync channel. */
	log_debug("%s: telling vm %s device is ready", __func__, vcp->vcp_name);
	memset(&msg, 0, sizeof(msg));
	msg.type = VIODEV_MSG_READY;
	imsg_compose_event2(&dev.sync_iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
	    sizeof(msg), ev_base_main);

	/* Send a ready message over the async channel. */
	log_debug("%s: sending async ready message", __func__);
	ret = imsg_compose_event2(&dev.async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
	    &msg, sizeof(msg), ev_base_main);
	if (ret == -1) {
		log_warnx("%s: failed to send async ready message!", __func__);
		goto fail;
	}

	/* Engage the event loop! */
	ret = event_base_dispatch(ev_base_main);
	event_base_free(ev_base_main);

	/* Try stopping the rx & tx threads cleanly by messaging them. */
	vm_pipe_send(&pipe_rx, VIRTIO_THREAD_STOP);
	vm_pipe_send(&pipe_tx, VIRTIO_THREAD_STOP);

	/* Wait for threads to stop. */
	pthread_join(rx_thread, NULL);
	pthread_join(tx_thread, NULL);
	pthread_rwlock_destroy(&lock);

	/* Cleanup */
	if (ret == 0) {
		close_fd(dev.sync_fd);
		close_fd(dev.async_fd);
		close_fd(vionet->data_fd);
		close_fd(pipe_main.read);
		close_fd(pipe_main.write);
		close_fd(pipe_rx.write);
		close_fd(pipe_tx.write);
		close_fd(pipe_inject[READ]);
		close_fd(pipe_inject[WRITE]);
		_exit(ret);
		/* NOTREACHED */
	}
fail:
	/* Try firing off a message to the vm saying we're dying. */
	memset(&msg, 0, sizeof(msg));
	msg.type = VIODEV_MSG_ERROR;
	msg.data = ret;
	imsg_compose(&dev.sync_iev.ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
	    sizeof(msg));
	imsgbuf_flush(&dev.sync_iev.ibuf);

	close_fd(dev.sync_fd);
	close_fd(dev.async_fd);
	close_fd(pipe_inject[READ]);
	close_fd(pipe_inject[WRITE]);
	if (vionet != NULL)
		close_fd(vionet->data_fd);
	if (lock != NULL)
		pthread_rwlock_destroy(&lock);
	_exit(ret);
}

/*
 * Update the gpa and hva of the virtqueue.
 */
static void
vionet_update_qa(struct vionet_dev *dev)
{
	struct virtio_vq_info *vq_info;
	void *hva = NULL;

	/* Invalid queue? */
	if (dev->cfg.queue_select > 1)
		return;

	vq_info = &dev->vq[dev->cfg.queue_select];
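	/*
	 * The legacy interface exposes queue addresses as 4 KiB page frame
	 * numbers; writing the pfn back normalizes any low bits the driver
	 * may have set.
	 */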
	vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE;
	dev->cfg.queue_pfn = vq_info->q_gpa >> 12;

	if (vq_info->q_gpa == 0)
		vq_info->q_hva = NULL;

	hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIONET_QUEUE_SIZE));
	if (hva == NULL)
		fatalx("%s: hva == NULL", __func__);

	vq_info->q_hva = hva;
}

/*
 * Update the queue size.
 */
static void
vionet_update_qs(struct vionet_dev *dev)
{
	struct virtio_vq_info *vq_info;

	/* Invalid queue? */
	if (dev->cfg.queue_select > 1) {
		log_warnx("%s: !!! invalid queue selector %d", __func__,
		    dev->cfg.queue_select);
		dev->cfg.queue_size = 0;
		return;
	}

	vq_info = &dev->vq[dev->cfg.queue_select];

	/* Update queue pfn/size based on queue select */
	dev->cfg.queue_pfn = vq_info->q_gpa >> 12;
	dev->cfg.queue_size = vq_info->qs;
}

/*
 * vionet_rx
 *
 * Pull packets off the provided fd, filling the receive-side virtqueue. We
 * selectively use zero-copy approaches when possible.
 *
 * Returns 1 if the guest needs notification, 0 if no notification is needed,
 * or -1 on failure.
 */
static int
vionet_rx(struct vionet_dev *dev, int fd)
{
	uint16_t idx, hdr_idx;
	char *vr = NULL;
	size_t chain_len = 0, iov_cnt;
	struct vring_desc *desc, *table;
	struct vring_avail *avail;
	struct vring_used *used;
	struct virtio_vq_info *vq_info;
	struct iovec *iov;
	int notify = 0;
	ssize_t sz;
	uint8_t status = 0;

	status = dev->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK;
	if (status != VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) {
		log_warnx("%s: driver not ready", __func__);
		return (0);
	}

	vq_info = &dev->vq[RXQ];
	idx = vq_info->last_avail;
	vr = vq_info->q_hva;
	if (vr == NULL)
		fatalx("%s: vr == NULL", __func__);

	/* Compute offsets in ring of descriptors, avail ring, and used ring */
	table = (struct vring_desc *)(vr);
	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
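
	/*
	 * The rx thread rescans the avail ring whenever the tap or injection
	 * pipe is readable, so tell the driver it may elide notifications
	 * for newly available buffers.
	 */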
	used->flags |= VRING_USED_F_NO_NOTIFY;

	while (idx != avail->idx) {
		hdr_idx = avail->ring[idx & VIONET_QUEUE_MASK];
		desc = &table[hdr_idx & VIONET_QUEUE_MASK];
		if (!DESC_WRITABLE(desc)) {
			log_warnx("%s: invalid descriptor state", __func__);
			goto reset;
		}

		iov = &iov_rx[0];
		iov_cnt = 1;

		/*
		 * First descriptor should be at least as large as the
		 * virtio_net_hdr. It's not technically required, but in
		 * legacy devices it should be safe to assume.
		 */
		iov->iov_len = desc->len;
		if (iov->iov_len < sizeof(struct virtio_net_hdr)) {
			log_warnx("%s: invalid descriptor length", __func__);
			goto reset;
		}

		/*
		 * Insert the virtio_net_hdr and adjust len/base. We do the
		 * pointer math here before it's a void*.
		 */
		iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
		if (iov->iov_base == NULL)
			goto reset;
		memset(iov->iov_base, 0, sizeof(struct virtio_net_hdr));

		/* Tweak the iovec to account for the virtio_net_hdr. */
		iov->iov_len -= sizeof(struct virtio_net_hdr);
		iov->iov_base = hvaddr_mem(desc->addr +
		    sizeof(struct virtio_net_hdr), iov->iov_len);
		if (iov->iov_base == NULL)
			goto reset;
		chain_len = iov->iov_len;

		/*
		 * Walk the rest of the chain, collecting addresses and
		 * lengths.
		 */
		while (desc->flags & VRING_DESC_F_NEXT) {
			desc = &table[desc->next & VIONET_QUEUE_MASK];
			if (!DESC_WRITABLE(desc)) {
				log_warnx("%s: invalid descriptor state",
				    __func__);
				goto reset;
			}

			/* Collect our IO information. Translate gpa's. */
			iov = &iov_rx[iov_cnt];
			iov->iov_len = desc->len;
			iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
			if (iov->iov_base == NULL)
				goto reset;
			chain_len += iov->iov_len;

			/* Guard against infinitely looping chains. */
			if (++iov_cnt >= nitems(iov_rx)) {
				log_warnx("%s: infinite chain detected",
				    __func__);
				goto reset;
			}
		}

		/* Make sure the driver gave us the bare minimum buffers. */
		if (chain_len < VIONET_MIN_TXLEN) {
			log_warnx("%s: insufficient buffers provided",
			    __func__);
			goto reset;
		}

		/*
		 * If we're enforcing a hardware address or handling an
		 * injected packet, we need to use a copy-based approach.
		 */
		if (dev->lockedmac || fd != dev->data_fd)
			sz = vionet_rx_copy(dev, fd, iov_rx, iov_cnt,
			    chain_len);
		else
			sz = vionet_rx_zerocopy(dev, fd, iov_rx, iov_cnt);
		if (sz == -1)
			goto reset;
		if (sz == 0)	/* No packets, so bail out for now. */
			break;

		/*
		 * Account for the prefixed header since it wasn't included
		 * in the copy or zerocopy operations.
		 */
		sz += sizeof(struct virtio_net_hdr);

		/* Mark our buffers as used. */
		used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_idx;
		used->ring[used->idx & VIONET_QUEUE_MASK].len = sz;
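		/* Publish the used entry before bumping the used index. */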
		__sync_synchronize();
		used->idx++;
		idx++;
	}

	if (idx != vq_info->last_avail &&
	    !(avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
		notify = 1;
	}

	vq_info->last_avail = idx;
	return (notify);
reset:
	return (-1);
}

/*
 * vionet_rx_copy
 *
 * Read a packet off the provided file descriptor, validate its
 * characteristics, and copy it into the provided iovec array's buffers.
 *
 * It's assumed that the provided iovec array contains validated host virtual
 * address translations and not guest physical addresses.
 *
 * Returns number of bytes copied on success, 0 if the packet is dropped, and
 * -1 on an error.
 */
static ssize_t
vionet_rx_copy(struct vionet_dev *dev, int fd, const struct iovec *iov,
    int iov_cnt, size_t chain_len)
{
	static uint8_t		 buf[VIONET_HARD_MTU];
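	/*
	 * A static scratch buffer is safe here: only the rx thread's event
	 * handlers call into this function.
	 */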
	struct packet		*pkt = NULL;
	struct ether_header	*eh = NULL;
	uint8_t			*payload = buf;
	size_t			 i, chunk, nbytes, copied = 0;
	ssize_t			 sz;

	/* If reading from the tap(4), try to right-size the read. */
	if (fd == dev->data_fd)
		nbytes = MIN(chain_len, VIONET_HARD_MTU);
	else if (fd == pipe_inject[READ])
		nbytes = sizeof(struct packet);
	else {
		log_warnx("%s: invalid fd: %d", __func__, fd);
		return (-1);
	}

	/*
	 * Try to pull a packet. The fd should be non-blocking and we don't
	 * care if we under-read (i.e. sz != nbytes) as we may not have a
	 * packet large enough to fill the buffer.
	 */
	sz = read(fd, buf, nbytes);
	if (sz == -1) {
		if (errno != EAGAIN) {
			log_warn("%s: error reading packet", __func__);
			return (-1);
		}
		return (0);
	} else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) {
		/* If reading the tap(4), we should get valid ethernet. */
		log_warnx("%s: invalid packet size", __func__);
		return (0);
	} else if (fd == pipe_inject[READ] && sz != sizeof(struct packet)) {
		log_warnx("%s: invalid injected packet object (sz=%ld)",
		    __func__, sz);
		return (0);
	}

	/* Decompose an injected packet, if that's what we're working with. */
	if (fd == pipe_inject[READ]) {
		pkt = (struct packet *)buf;
		if (pkt->buf == NULL) {
			log_warnx("%s: invalid injected packet, no buffer",
			    __func__);
			return (0);
		}
		if (pkt->len < VIONET_MIN_TXLEN ||
		    pkt->len > VIONET_MAX_TXLEN) {
			log_warnx("%s: invalid injected packet size", __func__);
			goto drop;
		}
		payload = pkt->buf;
		sz = (ssize_t)pkt->len;
	}

	/* Validate the ethernet header, if required. */
	if (dev->lockedmac) {
		eh = (struct ether_header *)(payload);
		if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
		    memcmp(eh->ether_dhost, dev->mac,
		    sizeof(eh->ether_dhost)) != 0)
			goto drop;
	}

	/* Truncate one last time to the chain length, if shorter. */
	sz = MIN(chain_len, (size_t)sz);

	/*
	 * Copy the packet into the provided buffers. We can use memcpy(3)
	 * here as the gpa was validated and translated to an hva previously.
	 */
	for (i = 0; (int)i < iov_cnt && (size_t)sz > copied; i++) {
		chunk = MIN(iov[i].iov_len, (size_t)(sz - copied));
		memcpy(iov[i].iov_base, payload + copied, chunk);
		copied += chunk;
	}

drop:
	/* Free any injected packet buffer. */
	if (pkt != NULL)
		free(pkt->buf);

	return (copied);
}

/*
 * vionet_rx_zerocopy
 *
 * Perform a vectorized read from the given fd into the guest physical memory
 * pointed to by iovecs.
 *
 * Returns number of bytes read on success, -1 on error, or 0 if EAGAIN was
 * returned by readv.
 */
static ssize_t
vionet_rx_zerocopy(struct vionet_dev *dev, int fd, const struct iovec *iov,
    int iov_cnt)
{
	ssize_t		sz;

	if (dev->lockedmac) {
		log_warnx("%s: zerocopy not available for locked lladdr",
		    __func__);
		return (-1);
	}

	sz = readv(fd, iov, iov_cnt);
	if (sz == -1 && errno == EAGAIN)
		return (0);
	return (sz);
}

/*
 * vionet_rx_event
 *
 * Called when new data can be received on the tap fd of a vionet device.
 */
static void
vionet_rx_event(int fd, short event, void *arg)
{
	struct virtio_dev	*dev = (struct virtio_dev *)arg;
	struct vionet_dev	*vionet = &dev->vionet;
	int			 ret = 0;

	if (!(event & EV_READ))
		fatalx("%s: invalid event type", __func__);

	pthread_rwlock_rdlock(&lock);
	ret = vionet_rx(vionet, fd);
	pthread_rwlock_unlock(&lock);

	if (ret == 0) {
		/* Nothing to do. */
		return;
	}

	pthread_rwlock_wrlock(&lock);
	if (ret == 1) {
		/* Notify the driver. */
		vionet->cfg.isr_status |= 1;
	} else {
		/* Need a reset. Something went wrong. */
		log_warnx("%s: requesting device reset", __func__);
		vionet->cfg.device_status |= DEVICE_NEEDS_RESET;
		vionet->cfg.isr_status |= VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
	}
	pthread_rwlock_unlock(&lock);

	vm_pipe_send(&pipe_main, VIRTIO_RAISE_IRQ);
}

static void
vionet_notifyq(struct virtio_dev *dev)
{
	struct vionet_dev	*vionet = &dev->vionet;

	switch (vionet->cfg.queue_notify) {
	case RXQ:
		rx_enabled = 1;
		vm_pipe_send(&pipe_rx, VIRTIO_NOTIFY);
		break;
	case TXQ:
		vm_pipe_send(&pipe_tx, VIRTIO_NOTIFY);
		break;
	default:
		/*
		 * Catch the unimplemented queue ID 2 (control queue) as
		 * well as any bogus queue IDs.
		 */
		log_debug("%s: notify for unimplemented queue ID %d",
		    __func__, vionet->cfg.queue_notify);
		break;
	}
}

static int
vionet_tx(struct virtio_dev *dev)
{
	uint16_t idx, hdr_idx;
	size_t chain_len, iov_cnt;
	ssize_t dhcpsz = 0, sz;
	int notify = 0;
	char *vr = NULL, *dhcppkt = NULL;
	struct vionet_dev *vionet = &dev->vionet;
	struct vring_desc *desc, *table;
	struct vring_avail *avail;
	struct vring_used *used;
	struct virtio_vq_info *vq_info;
	struct ether_header *eh;
	struct iovec *iov;
	struct packet pkt;
	uint8_t status = 0;

	status = vionet->cfg.device_status
	    & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK;
	if (status != VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) {
		log_warnx("%s: driver not ready", __func__);
		return (0);
	}

	vq_info = &vionet->vq[TXQ];
	idx = vq_info->last_avail;
	vr = vq_info->q_hva;
	if (vr == NULL)
		fatalx("%s: vr == NULL", __func__);

	/* Compute offsets in ring of descriptors, avail ring, and used ring */
	table = (struct vring_desc *)(vr);
	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);

	while (idx != avail->idx) {
		hdr_idx = avail->ring[idx & VIONET_QUEUE_MASK];
		desc = &table[hdr_idx & VIONET_QUEUE_MASK];
		if (DESC_WRITABLE(desc)) {
			log_warnx("%s: invalid descriptor state", __func__);
			goto reset;
		}

		iov = &iov_tx[0];
		iov_cnt = 0;
		chain_len = 0;

		/*
		 * As a legacy device, we most likely will receive a lead
		 * descriptor sized to the virtio_net_hdr. However, the framing
		 * is not guaranteed, so check for packet data.
		 */
		iov->iov_len = desc->len;
		if (iov->iov_len < sizeof(struct virtio_net_hdr)) {
			log_warnx("%s: invalid descriptor length", __func__);
			goto reset;
		} else if (iov->iov_len > sizeof(struct virtio_net_hdr)) {
			/* Chop off the virtio header, leaving packet data. */
			iov->iov_len -= sizeof(struct virtio_net_hdr);
			chain_len += iov->iov_len;
			iov->iov_base = hvaddr_mem(desc->addr +
			    sizeof(struct virtio_net_hdr), iov->iov_len);
			if (iov->iov_base == NULL)
				goto reset;
			iov_cnt++;
		}

		/*
		 * Walk the chain and collect remaining addresses and lengths.
		 */
		while (desc->flags & VRING_DESC_F_NEXT) {
			desc = &table[desc->next & VIONET_QUEUE_MASK];
			if (DESC_WRITABLE(desc)) {
				log_warnx("%s: invalid descriptor state",
				    __func__);
				goto reset;
			}

			/* Collect our IO information, translating gpa's. */
			iov = &iov_tx[iov_cnt];
			iov->iov_len = desc->len;
			iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
			if (iov->iov_base == NULL)
				goto reset;
			chain_len += iov->iov_len;

			/* Guard against infinitely looping chains. */
			if (++iov_cnt >= nitems(iov_tx)) {
				log_warnx("%s: infinite chain detected",
				    __func__);
				goto reset;
			}
		}

		/* Check if we've got a minimum viable amount of data. */
		if (chain_len < VIONET_MIN_TXLEN)
			goto drop;

		/*
		 * Inspect the ethernet header: on a "local" interface we
		 * check for a possible DHCP packet, and with a locked
		 * lladdr we validate the source address.
		 *
		 * To help preserve zero-copy semantics, we require that the
		 * first descriptor with packet data contains a buffer large
		 * enough for this inspection.
		 */
		iov = &iov_tx[0];
		if (vionet->lockedmac) {
			if (iov->iov_len < ETHER_HDR_LEN) {
				log_warnx("%s: insufficient header data",
				    __func__);
				goto drop;
			}
			eh = (struct ether_header *)iov->iov_base;
			if (memcmp(eh->ether_shost, vionet->mac,
			    sizeof(eh->ether_shost)) != 0) {
				log_warnx("%s: bad source address %s",
				    __func__, ether_ntoa((struct ether_addr *)
					eh->ether_shost));
				goto drop;
			}
		}
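		/*
		 * On a "local" interface, intercept DHCP requests: the
		 * request is dropped below and a synthesized reply is
		 * injected back into the rx path via pipe_inject.
		 */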
		if (vionet->local) {
			dhcpsz = dhcp_request(dev, iov->iov_base, iov->iov_len,
			    &dhcppkt);
			if (dhcpsz > 0) {
				log_debug("%s: detected dhcp request of %zd bytes",
				    __func__, dhcpsz);
				goto drop;
			}
		}

		/* Write our packet to the tap(4). */
		sz = writev(vionet->data_fd, iov_tx, iov_cnt);
		if (sz == -1 && errno != ENOBUFS) {
			log_warn("%s", __func__);
			goto reset;
		}
		chain_len += sizeof(struct virtio_net_hdr);
drop:
		used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_idx;
		used->ring[used->idx & VIONET_QUEUE_MASK].len = chain_len;
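		/* Publish the used entry before bumping the used index. */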
		__sync_synchronize();
		used->idx++;
		idx++;

		/* Facilitate DHCP reply injection, if needed. */
		if (dhcpsz > 0) {
			pkt.buf = dhcppkt;
			pkt.len = dhcpsz;
			sz = write(pipe_inject[WRITE], &pkt, sizeof(pkt));
			if (sz == -1 && errno != EAGAIN) {
				log_warn("%s: packet injection", __func__);
				free(pkt.buf);
			} else if (sz == -1 && errno == EAGAIN) {
				log_debug("%s: dropping dhcp reply", __func__);
				free(pkt.buf);
			} else if (sz != sizeof(pkt)) {
				log_warnx("%s: failed packet injection",
				    __func__);
				free(pkt.buf);
			}
			log_debug("%s: injected dhcp reply with %ld bytes",
			    __func__, sz);
		}
	}

	if (idx != vq_info->last_avail &&
	    !(avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		notify = 1;

	vq_info->last_avail = idx;
	return (notify);
reset:
	return (-1);
}

static void
dev_dispatch_vm(int fd, short event, void *arg)
{
	struct virtio_dev	*dev = arg;
	struct vionet_dev	*vionet = &dev->vionet;
	struct imsgev		*iev = &dev->async_iev;
	struct imsgbuf		*ibuf = &iev->ibuf;
	struct imsg		 imsg;
	ssize_t			 n = 0;
	int			 verbose;

	if (dev == NULL)
		fatalx("%s: missing vionet pointer", __func__);

	if (event & EV_READ) {
		if ((n = imsgbuf_read(ibuf)) == -1)
			fatal("%s: imsgbuf_read", __func__);
		if (n == 0) {
			/* this pipe is dead, so remove the event handler */
			log_debug("%s: pipe dead (EV_READ)", __func__);
			event_del(&iev->ev);
			event_base_loopexit(ev_base_main, NULL);
			return;
		}
	}

	if (event & EV_WRITE) {
		if (imsgbuf_write(ibuf) == -1) {
			if (errno == EPIPE) {
				/* this pipe is dead, remove the handler */
				log_debug("%s: pipe dead (EV_WRITE)", __func__);
				event_del(&iev->ev);
				event_base_loopexit(ev_base_main, NULL);
				return;
			}
			fatal("%s: imsgbuf_write", __func__);
		}
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

		switch (imsg.hdr.type) {
		case IMSG_DEVOP_HOSTMAC:
			IMSG_SIZE_CHECK(&imsg, vionet->hostmac);
			memcpy(vionet->hostmac, imsg.data,
			    sizeof(vionet->hostmac));
			log_debug("%s: set hostmac", __func__);
			break;
		case IMSG_VMDOP_PAUSE_VM:
			log_debug("%s: pausing", __func__);
			vm_pipe_send(&pipe_rx, VIRTIO_THREAD_PAUSE);
			break;
		case IMSG_VMDOP_UNPAUSE_VM:
			log_debug("%s: unpausing", __func__);
			if (rx_enabled)
				vm_pipe_send(&pipe_rx, VIRTIO_THREAD_START);
			break;
		case IMSG_CTL_VERBOSE:
			IMSG_SIZE_CHECK(&imsg, &verbose);
			memcpy(&verbose, imsg.data, sizeof(verbose));
			log_setverbose(verbose);
			break;
		}
		imsg_free(&imsg);
	}
	imsg_event_add2(iev, ev_base_main);
}

/*
 * Synchronous IO handler.
 */
static void
handle_sync_io(int fd, short event, void *arg)
{
	struct virtio_dev *dev = (struct virtio_dev *)arg;
	struct imsgev *iev = &dev->sync_iev;
	struct imsgbuf *ibuf = &iev->ibuf;
	struct viodev_msg msg;
	struct imsg imsg;
	ssize_t n;
	int8_t intr = INTR_STATE_NOOP;

	if (event & EV_READ) {
		if ((n = imsgbuf_read(ibuf)) == -1)
			fatal("%s: imsgbuf_read", __func__);
		if (n == 0) {
			/* this pipe is dead, so remove the event handler */
			log_debug("%s: pipe dead (EV_READ)", __func__);
			event_del(&iev->ev);
			event_base_loopexit(ev_base_main, NULL);
			return;
		}
	}

	if (event & EV_WRITE) {
		if (imsgbuf_write(ibuf) == -1) {
			if (errno == EPIPE) {
				/* this pipe is dead, remove the handler */
				log_debug("%s: pipe dead (EV_WRITE)", __func__);
				event_del(&iev->ev);
				event_base_loopexit(ev_base_main, NULL);
				return;
			}
			fatal("%s: imsgbuf_write", __func__);
		}
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatalx("%s: imsg_get (n=%ld)", __func__, n);
		if (n == 0)
			break;

		/* Unpack our message. They ALL should be dev messages! */
		IMSG_SIZE_CHECK(&imsg, &msg);
		memcpy(&msg, imsg.data, sizeof(msg));
		imsg_free(&imsg);

		switch (msg.type) {
		case VIODEV_MSG_DUMP:
			/* Dump device */
			n = atomicio(vwrite, dev->sync_fd, dev, sizeof(*dev));
			if (n != sizeof(*dev)) {
				log_warnx("%s: failed to dump vionet device",
				    __func__);
				break;
			}
		case VIODEV_MSG_IO_READ:
			/* Read IO: make sure to send a reply */
			msg.data = handle_io_read(&msg, dev, &intr);
			msg.data_valid = 1;
			msg.state = intr;
			imsg_compose_event2(iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
			    sizeof(msg), ev_base_main);
			break;
		case VIODEV_MSG_IO_WRITE:
			/* Write IO: no reply needed */
			handle_io_write(&msg, dev);
			break;
		case VIODEV_MSG_SHUTDOWN:
			event_del(&dev->sync_iev.ev);
			event_base_loopbreak(ev_base_main);
			return;
		default:
			fatalx("%s: invalid msg type %d", __func__, msg.type);
		}
	}
	imsg_event_add2(iev, ev_base_main);
}

static void
handle_io_write(struct viodev_msg *msg, struct virtio_dev *dev)
{
	struct vionet_dev	*vionet = &dev->vionet;
	uint32_t		 data = msg->data;
	int			 pause_devices = 0;

	pthread_rwlock_wrlock(&lock);

	switch (msg->reg) {
	case VIRTIO_CONFIG_DEVICE_FEATURES:
	case VIRTIO_CONFIG_QUEUE_SIZE:
	case VIRTIO_CONFIG_ISR_STATUS:
		log_warnx("%s: illegal write %x to %s", __progname, data,
		    virtio_reg_name(msg->reg));
		break;
	case VIRTIO_CONFIG_GUEST_FEATURES:
		vionet->cfg.guest_feature = data;
		break;
	case VIRTIO_CONFIG_QUEUE_PFN:
		vionet->cfg.queue_pfn = data;
		vionet_update_qa(vionet);
		break;
	case VIRTIO_CONFIG_QUEUE_SELECT:
		vionet->cfg.queue_select = data;
		vionet_update_qs(vionet);
		break;
	case VIRTIO_CONFIG_QUEUE_NOTIFY:
		vionet->cfg.queue_notify = data;
		vionet_notifyq(dev);
		break;
	case VIRTIO_CONFIG_DEVICE_STATUS:
		if (data == 0) {
			resetting = 2;	/* Wait on two acks: rx & tx */
			pause_devices = 1;
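			/*
			 * The reset completes in read_pipe_main() once
			 * both threads acknowledge pausing.
			 */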
		} else {
			// XXX is this correct?
			vionet->cfg.device_status = data;
		}
		break;
	}

	pthread_rwlock_unlock(&lock);
	if (pause_devices) {
		rx_enabled = 0;
		vionet_deassert_pic_irq(dev);
		vm_pipe_send(&pipe_rx, VIRTIO_THREAD_PAUSE);
		vm_pipe_send(&pipe_tx, VIRTIO_THREAD_PAUSE);
	}
}

static uint32_t
handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev, int8_t *intr)
{
	struct vionet_dev *vionet = &dev->vionet;
	uint32_t data;

	pthread_rwlock_rdlock(&lock);

	switch (msg->reg) {
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
		data = vionet->mac[msg->reg -
		    VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI];
		break;
	case VIRTIO_CONFIG_DEVICE_FEATURES:
		data = vionet->cfg.device_feature;
		break;
	case VIRTIO_CONFIG_GUEST_FEATURES:
		data = vionet->cfg.guest_feature;
		break;
	case VIRTIO_CONFIG_QUEUE_PFN:
		data = vionet->cfg.queue_pfn;
		break;
	case VIRTIO_CONFIG_QUEUE_SIZE:
		data = vionet->cfg.queue_size;
		break;
	case VIRTIO_CONFIG_QUEUE_SELECT:
		data = vionet->cfg.queue_select;
		break;
	case VIRTIO_CONFIG_QUEUE_NOTIFY:
		data = vionet->cfg.queue_notify;
		break;
	case VIRTIO_CONFIG_DEVICE_STATUS:
		data = vionet->cfg.device_status;
		break;
	case VIRTIO_CONFIG_ISR_STATUS:
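		/*
		 * Reading the ISR clears it, so we need the write lock;
		 * a rwlock can't be upgraded in place, so drop the read
		 * lock and reacquire.
		 */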
		pthread_rwlock_unlock(&lock);
		pthread_rwlock_wrlock(&lock);
		data = vionet->cfg.isr_status;
		vionet->cfg.isr_status = 0;
		if (intr != NULL)
			*intr = INTR_STATE_DEASSERT;
		break;
	default:
		data = 0xFFFFFFFF;
	}

	pthread_rwlock_unlock(&lock);
	return (data);
}

/*
 * Handle the rx side processing, communicating to the main thread via pipe.
 */
static void *
rx_run_loop(void *arg)
{
	struct virtio_dev	*dev = (struct virtio_dev *)arg;
	struct vionet_dev	*vionet = &dev->vionet;
	int			 ret;

	ev_base_rx = event_base_new();

	/* Wire up event handling for the tap fd. */
	event_set(&ev_tap, vionet->data_fd, EV_READ | EV_PERSIST,
	    vionet_rx_event, dev);
	event_base_set(ev_base_rx, &ev_tap);

	/* Wire up event handling for the packet injection pipe. */
	event_set(&ev_inject, pipe_inject[READ], EV_READ | EV_PERSIST,
	    vionet_rx_event, dev);
	event_base_set(ev_base_rx, &ev_inject);

	/* Wire up event handling for our inter-thread communication channel. */
	event_base_set(ev_base_rx, &pipe_rx.read_ev);
	event_add(&pipe_rx.read_ev, NULL);

	/* Begin our event loop with our channel event active. */
	ret = event_base_dispatch(ev_base_rx);
	event_base_free(ev_base_rx);

	log_debug("%s: exiting (%d)", __func__, ret);

	close_fd(pipe_rx.read);
	close_fd(pipe_inject[READ]);

	return (NULL);
}

/*
 * Handle the tx side processing, communicating to the main thread via pipe.
 */
static void *
tx_run_loop(void *arg)
{
	int			 ret;

	ev_base_tx = event_base_new();

	/* Wire up event handling for our inter-thread communication channel. */
	event_base_set(ev_base_tx, &pipe_tx.read_ev);
	event_add(&pipe_tx.read_ev, NULL);

	/* Begin our event loop with our channel event active. */
	ret = event_base_dispatch(ev_base_tx);
	event_base_free(ev_base_tx);

	log_debug("%s: exiting (%d)", __func__, ret);

	close_fd(pipe_tx.read);

	return (NULL);
}

/*
 * Read events sent by the main thread to the rx thread.
 */
static void
read_pipe_rx(int fd, short event, void *arg)
{
	enum pipe_msg_type	msg;

	if (!(event & EV_READ))
		fatalx("%s: invalid event type", __func__);

	msg = vm_pipe_recv(&pipe_rx);

	switch (msg) {
	case VIRTIO_NOTIFY:
	case VIRTIO_THREAD_START:
		event_add(&ev_tap, NULL);
		event_add(&ev_inject, NULL);
		break;
	case VIRTIO_THREAD_PAUSE:
		event_del(&ev_tap);
		event_del(&ev_inject);
		vm_pipe_send(&pipe_main, VIRTIO_THREAD_ACK);
		break;
	case VIRTIO_THREAD_STOP:
		event_del(&ev_tap);
		event_del(&ev_inject);
		event_base_loopexit(ev_base_rx, NULL);
		break;
	default:
		fatalx("%s: invalid channel message: %d", __func__, msg);
	}
}

/*
 * Read events sent by the main thread to the tx thread.
 */
static void
read_pipe_tx(int fd, short event, void *arg)
{
	struct virtio_dev	*dev = (struct virtio_dev *)arg;
	struct vionet_dev	*vionet = &dev->vionet;
	enum pipe_msg_type	 msg;
	int			 ret = 0;

	if (!(event & EV_READ))
		fatalx("%s: invalid event type", __func__);

	msg = vm_pipe_recv(&pipe_tx);

	switch (msg) {
	case VIRTIO_NOTIFY:
		pthread_rwlock_rdlock(&lock);
		ret = vionet_tx(dev);
		pthread_rwlock_unlock(&lock);
		break;
	case VIRTIO_THREAD_START:
		/* Ignore Start messages. */
		break;
	case VIRTIO_THREAD_PAUSE:
		/*
		 * Nothing to do when pausing on the tx side, but ACK so main
		 * thread knows we're not transmitting.
		 */
		vm_pipe_send(&pipe_main, VIRTIO_THREAD_ACK);
		break;
	case VIRTIO_THREAD_STOP:
		event_base_loopexit(ev_base_tx, NULL);
		break;
	default:
		fatalx("%s: invalid channel message: %d", __func__, msg);
	}

	if (ret == 0) {
		/* No notification needed. Return early. */
		return;
	}

	pthread_rwlock_wrlock(&lock);
	if (ret == 1) {
		/* Notify the driver. */
		vionet->cfg.isr_status |= 1;
	} else {
		/* Need a reset. Something went wrong. */
		log_warnx("%s: requesting device reset", __func__);
		vionet->cfg.device_status |= DEVICE_NEEDS_RESET;
		vionet->cfg.isr_status |= VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
	}
	pthread_rwlock_unlock(&lock);

	vm_pipe_send(&pipe_main, VIRTIO_RAISE_IRQ);
}

/*
 * Read events sent by the rx/tx threads to the main thread.
 */
static void
read_pipe_main(int fd, short event, void *arg)
{
	struct virtio_dev	*dev = (struct virtio_dev *)arg;
	struct vionet_dev	*vionet = &dev->vionet;
	enum pipe_msg_type	 msg;

	if (!(event & EV_READ))
		fatalx("%s: invalid event type", __func__);

	msg = vm_pipe_recv(&pipe_main);
	switch (msg) {
	case VIRTIO_RAISE_IRQ:
		vionet_assert_pic_irq(dev);
		break;
	case VIRTIO_THREAD_ACK:
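		/*
		 * Both the rx and tx threads ack a reset-triggered pause;
		 * only after the second ack is it safe to reset device
		 * state.
		 */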
		resetting--;
		if (resetting == 0) {
			log_debug("%s: resetting virtio network device %d",
			    __func__, vionet->idx);

			pthread_rwlock_wrlock(&lock);
			vionet->cfg.device_status = 0;
			vionet->cfg.guest_feature = 0;
			vionet->cfg.queue_pfn = 0;
			vionet_update_qa(vionet);
			vionet->cfg.queue_size = 0;
			vionet_update_qs(vionet);
			vionet->cfg.queue_select = 0;
			vionet->cfg.queue_notify = 0;
			vionet->cfg.isr_status = 0;
			vionet->vq[RXQ].last_avail = 0;
			vionet->vq[RXQ].notified_avail = 0;
			vionet->vq[TXQ].last_avail = 0;
			vionet->vq[TXQ].notified_avail = 0;
			pthread_rwlock_unlock(&lock);
		}
		break;
	default:
		fatalx("%s: invalid channel msg: %d", __func__, msg);
	}
}

/*
 * Message the vm process asking to raise the irq. Must be called from the main
 * thread.
 */
static void
vionet_assert_pic_irq(struct virtio_dev *dev)
{
	struct viodev_msg	msg;
	int			ret;

	memset(&msg, 0, sizeof(msg));
	msg.irq = dev->irq;
	msg.vcpu = 0; // XXX
	msg.type = VIODEV_MSG_KICK;
	msg.state = INTR_STATE_ASSERT;

	ret = imsg_compose_event2(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
	    &msg, sizeof(msg), ev_base_main);
	if (ret == -1)
		log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
}

/*
 * Message the vm process asking to lower the irq. Must be called from the main
 * thread.
 */
static void
vionet_deassert_pic_irq(struct virtio_dev *dev)
{
	struct viodev_msg	msg;
	int			ret;

	memset(&msg, 0, sizeof(msg));
	msg.irq = dev->irq;
	msg.vcpu = 0; // XXX
	msg.type = VIODEV_MSG_KICK;
	msg.state = INTR_STATE_DEASSERT;

	ret = imsg_compose_event2(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
	    &msg, sizeof(msg), ev_base_main);
	if (ret == -1)
		log_warnx("%s: failed to deassert irq %d", __func__, dev->irq);
}