/*	$OpenBSD: vmm.c,v 1.44 2016/09/03 11:38:08 mlarkin Exp $	*/

/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>	/* nitems */
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/wait.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/mman.h>

#include <dev/ic/i8253reg.h>
#include <dev/isa/isareg.h>
#include <dev/pci/pcireg.h>

#include <machine/param.h>
#include <machine/psl.h>
#include <machine/specialreg.h>
#include <machine/vmmvar.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <imsg.h>
#include <limits.h>
#include <poll.h>
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <util.h>

#include "vmd.h"
#include "vmm.h"
#include "loadfile.h"
#include "pci.h"
#include "virtio.h"
#include "proc.h"
#include "i8253.h"
#include "i8259.h"
#include "ns8250.h"
#include "mc146818.h"

io_fn_t ioports_map[MAX_PORTS];

void vmm_sighdlr(int, short, void *);
int start_client_vmd(void);
int opentap(void);
int start_vm(struct imsg *, uint32_t *);
int terminate_vm(struct vm_terminate_params *);
int get_info_vm(struct privsep *, struct imsg *, int);
int run_vm(int *, int *, struct vm_create_params *, struct vcpu_reg_state *);
void *event_thread(void *);
void *vcpu_run_loop(void *);
int vcpu_exit(struct vm_run_params *);
int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
void create_memory_map(struct vm_create_params *);
int alloc_guest_mem(struct vm_create_params *);
int vmm_create_vm(struct vm_create_params *);
void init_emulated_hw(struct vm_create_params *, int *, int *);
void vcpu_exit_inout(struct vm_run_params *);
uint8_t vcpu_exit_pci(struct vm_run_params *);
int vmm_dispatch_parent(int, struct privsep_proc *, struct imsg *);
void vmm_run(struct privsep *, struct privsep_proc *, void *);
int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);

static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
    size_t);

int con_fd;
struct vmd_vm *current_vm;

extern struct vmd *env;

extern char *__progname;

pthread_mutex_t threadmutex;
pthread_cond_t threadcond;

pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];

static struct privsep_proc procs[] = {
	{ "parent",	PROC_PARENT,	vmm_dispatch_parent  },
};

/*
 * Represents a standard register set for an OS to be booted
 * as a flat 32 bit address space, before paging is enabled.
 *
 * NOT set here are:
 *  RIP
 *  RSP
 *  GDTR BASE
 *
 * Specific bootloaders should clone this structure and override
 * those fields as needed.
 *
 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
 *        features of the CPU in use.
 */
static const struct vcpu_reg_state vcpu_init_flat32 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = CR0_CD | CR0_NW | CR0_ET | CR0_PE | CR0_PG,
	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
};
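
/*
 * Usage sketch for vcpu_init_flat32 (mirrors what start_vm() below does;
 * the concrete addresses are hypothetical, and the vsi_base field name
 * is assumed from the vcpu_segment_info definition in vmmvar.h):
 *
 *	struct vcpu_reg_state vrs;
 *
 *	memcpy(&vrs, &vcpu_init_flat32, sizeof(vrs));
 *	vrs.vrs_gprs[VCPU_REGS_RIP] = entry_gpa;    <- kernel entry point
 *	vrs.vrs_gprs[VCPU_REGS_RSP] = stack_gpa;    <- initial boot stack
 *	vrs.vrs_gdtr.vsi_base = gdt_gpa;            <- where the GDT lives
 */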

pid_t
vmm(struct privsep *ps, struct privsep_proc *p)
{
	return (proc_run(ps, p, procs, nitems(procs), vmm_run, NULL));
}

void
vmm_run(struct privsep *ps, struct privsep_proc *p, void *arg)
{
	if (config_init(ps->ps_env) == -1)
		fatal("failed to initialize configuration");

	signal_del(&ps->ps_evsigchld);
	signal_set(&ps->ps_evsigchld, SIGCHLD, vmm_sighdlr, ps);
	signal_add(&ps->ps_evsigchld, NULL);

#if 0
	/*
	 * pledge in the vmm process:
	 * stdio - for malloc and basic I/O including events.
	 * vmm - for the vmm ioctls and operations.
	 * proc - for forking and maintaining vms.
	 * recvfd - for disks, interfaces and other fds.
	 */
	/* XXX'ed pledge to hide it from grep as long as it's disabled */
	if (XXX("stdio vmm recvfd proc", NULL) == -1)
		fatal("pledge");
#endif

	/* Get and terminate all running VMs */
	get_info_vm(ps, NULL, 1);
}

int
vmm_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct privsep		*ps = p->p_ps;
	int			 res = 0, cmd = 0;
	struct vm_create_params	 vcp;
	struct vm_terminate_params vtp;
	struct vmop_result	 vmr;
	uint32_t		 id = 0;
	struct vmd_vm		*vm;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_START_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vcp);
		memcpy(&vcp, imsg->data, sizeof(vcp));
		res = config_getvm(ps, &vcp, imsg->fd, imsg->hdr.peerid);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_DISK:
		res = config_getdisk(ps, imsg);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_IF:
		res = config_getif(ps, imsg);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_END:
		res = start_vm(imsg, &id);
		cmd = IMSG_VMDOP_START_VM_RESPONSE;
		break;
	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vtp);
		memcpy(&vtp, imsg->data, sizeof(vtp));
		id = vtp.vtp_vm_id;
		res = terminate_vm(&vtp);
		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
		if (res == 0) {
			/* Remove local reference */
			vm = vm_getbyid(id);
			vm_remove(vm);
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
		res = get_info_vm(ps, imsg, 0);
		cmd = IMSG_VMDOP_GET_INFO_VM_END_DATA;
		break;
	case IMSG_CTL_RESET:
		config_getreset(env, imsg);
		break;
	default:
		return (-1);
	}

	switch (cmd) {
	case 0:
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
		if (res != 0) {
			vm = vm_getbyvmid(imsg->hdr.peerid);
			vm_remove(vm);
		}
		/* FALLTHROUGH */
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		memset(&vmr, 0, sizeof(vmr));
		vmr.vmr_result = res;
		vmr.vmr_id = id;
		if (proc_compose_imsg(ps, PROC_PARENT, -1, cmd,
		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	default:
		if (proc_compose_imsg(ps, PROC_PARENT, -1, cmd,
		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
			return (-1);
		break;
	}

	return (0);
}

void
vmm_sighdlr(int sig, short event, void *arg)
{
	struct privsep *ps = arg;
	int status;
	uint32_t vmid;
	pid_t pid;
	struct vmop_result vmr;
	struct vmd_vm *vm;
	struct vm_terminate_params vtp;

	switch (sig) {
	case SIGCHLD:
		do {
			pid = waitpid(-1, &status, WNOHANG);
			if (pid <= 0)
				continue;

			if (WIFEXITED(status) || WIFSIGNALED(status)) {
				vm = vm_getbypid(pid);
				if (vm == NULL) {
					/*
					 * If the VM is gone already, it
					 * got terminated via an
					 * IMSG_VMDOP_TERMINATE_VM_REQUEST.
					 */
					continue;
				}

				vmid = vm->vm_params.vcp_id;
				vtp.vtp_vm_id = vmid;
				if (terminate_vm(&vtp) == 0) {
					memset(&vmr, 0, sizeof(vmr));
					vmr.vmr_result = 0;
					vmr.vmr_id = vmid;
					vm_remove(vm);
					if (proc_compose_imsg(ps, PROC_PARENT,
					    -1, IMSG_VMDOP_TERMINATE_VM_EVENT,
					    0, -1, &vmr, sizeof(vmr)) == -1)
						log_warnx("could not signal "
						    "termination of VM %u to "
						    "parent", vmid);
				} else
					log_warnx("could not terminate VM %u",
					    vmid);
			} else
				fatalx("unexpected cause of SIGCHLD");
		} while (pid > 0 || (pid == -1 && errno == EINTR));
		break;
	default:
		fatalx("unexpected signal");
	}
}

/*
 * vcpu_reset
 *
 * Requests vmm(4) to reset the VCPUs in the indicated VM to
 * the register state provided
 *
 * Parameters
 *  vmid: VM ID to reset
 *  vcpu_id: VCPU ID to reset
 *  vrs: the register state to initialize
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
 *      valid)
 */
int
vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
{
	struct vm_resetcpu_params vrp;

	memset(&vrp, 0, sizeof(vrp));
	vrp.vrp_vm_id = vmid;
	vrp.vrp_vcpu_id = vcpu_id;
	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));

	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);

	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) < 0)
		return (errno);

	return (0);
}

/*
 * terminate_vm
 *
 * Requests vmm(4) to terminate the VM whose ID is provided in the
 * supplied vm_terminate_params structure (vtp->vtp_vm_id)
 *
 * Parameters
 *  vtp: vm_terminate_params struct containing the ID of the VM to terminate
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
 *      valid)
 */
int
terminate_vm(struct vm_terminate_params *vtp)
{
	if (ioctl(env->vmd_fd, VMM_IOC_TERM, vtp) < 0)
		return (errno);

	return (0);
}

/*
 * opentap
 *
 * Opens the next available tap device, up to MAX_TAP.
 *
 * Returns a file descriptor to the tap node opened, or -1 if no tap
 * devices were available.
 */
int
opentap(void)
{
	int i, fd;
	char path[PATH_MAX];

	for (i = 0; i < MAX_TAP; i++) {
		snprintf(path, PATH_MAX, "/dev/tap%d", i);
		fd = open(path, O_RDWR | O_NONBLOCK);
		if (fd != -1)
			return (fd);
	}

	return (-1);
}

/*
 * start_vm
 *
 * Starts a new VM with the creation parameters supplied (in the incoming
 * imsg->data field). This function performs a basic sanity check on the
 * incoming parameters and then performs the following steps to complete
 * the creation of the VM:
 *
 * 1. opens the VM disk image files specified in the VM creation parameters
 * 2. opens the specified VM kernel
 * 3. creates a VM console tty pair using openpty
 * 4. forks, passing the file descriptors opened in steps 1-3 to the child
 *     vmd responsible for dropping privilege and running the VM's VCPU
 *     loops.
 *
 * Parameters:
 *  imsg: The incoming imsg body whose 'data' field is a vm_create_params
 *      struct containing the VM creation parameters.
 *  id: Returns the VM id as reported by the kernel.
 *
 * Return values:
 *  0: success
 *  !0 : failure - typically an errno indicating the source of the failure
 */
int
start_vm(struct imsg *imsg, uint32_t *id)
{
	struct vm_create_params	*vcp;
	struct vmd_vm		*vm;
	size_t			 i;
	int			 ret = EINVAL;
	int			 fds[2];
	struct vcpu_reg_state	 vrs;

	if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
		log_warnx("%s: can't find vm", __func__);
		ret = ENOENT;
		goto err;
	}
	vcp = &vm->vm_params;

	if ((vm->vm_tty = imsg->fd) == -1) {
		log_warnx("%s: can't get tty", __func__);
		goto err;
	}

	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, fds) == -1)
		fatal("socketpair");

	/* Start child vmd for this VM (fork, chroot, drop privs) */
	ret = start_client_vmd();

	/* Start child failed? - cleanup and leave */
	if (ret == -1) {
		log_warnx("%s: start child failed", __func__);
		ret = EIO;
		goto err;
	}

	if (ret > 0) {
		/* Parent */
		vm->vm_pid = ret;

		for (i = 0 ; i < vcp->vcp_ndisks; i++) {
			close(vm->vm_disks[i]);
			vm->vm_disks[i] = -1;
		}

		for (i = 0 ; i < vcp->vcp_nnics; i++) {
			close(vm->vm_ifs[i]);
			vm->vm_ifs[i] = -1;
		}

		close(vm->vm_kernel);
		vm->vm_kernel = -1;

		close(vm->vm_tty);
		vm->vm_tty = -1;

		/* read back the kernel-generated vm id from the child */
		close(fds[1]);
		if (read(fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
		    sizeof(vcp->vcp_id))
			fatal("read vcp id");
		close(fds[0]);

		if (vcp->vcp_id == 0)
			goto err;

		*id = vcp->vcp_id;

		return (0);
	} else {
		/* Child */
		setproctitle("%s", vcp->vcp_name);
		log_procinit(vcp->vcp_name);

		create_memory_map(vcp);
		ret = alloc_guest_mem(vcp);
		if (ret) {
			errno = ret;
			fatal("could not allocate guest memory - exiting");
		}

		ret = vmm_create_vm(vcp);
		current_vm = vm;

		/* send back the kernel-generated vm id (0 on error) */
		close(fds[0]);
		if (write(fds[1], &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
		    sizeof(vcp->vcp_id))
			fatal("write vcp id");
		close(fds[1]);

		if (ret) {
			errno = ret;
			fatal("create vmm ioctl failed - exiting");
		}

#if 0
		/*
		 * pledge in the vm processes:
		 * stdio - for malloc and basic I/O including events.
		 * vmm - for the vmm ioctls and operations.
		 */
		if (XXX("stdio vmm", NULL) == -1)
			fatal("pledge");
#endif

		/*
		 * Set up default "flat 32 bit" register state - RIP,
		 * RSP, and GDT info will be set in bootloader
		 */
		memcpy(&vrs, &vcpu_init_flat32, sizeof(struct vcpu_reg_state));

		/* Load kernel image */
		ret = loadelf_main(vm->vm_kernel, vcp, &vrs);
		if (ret) {
			errno = ret;
			fatal("failed to load kernel - exiting");
		}

		close(vm->vm_kernel);

		con_fd = vm->vm_tty;
		if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
			fatal("failed to set nonblocking mode on console");

		/* Execute the vcpu run loop(s) for this VM */
		ret = run_vm(vm->vm_disks, vm->vm_ifs, vcp, &vrs);

		_exit(ret != 0);
	}

	return (0);

 err:
	vm_remove(vm);

	return (ret);
}

/*
 * get_info_vm
 *
 * Returns a list of VMs known to vmm(4).
 *
 * Parameters:
 *  ps: the privsep context.
 *  imsg: the received imsg including the peer id.
 *  terminate: terminate the listed vm.
 *
 * Return values:
 *  0: success
 *  !0 : failure (eg, ENOMEM, EIO or another error code from vmm(4) ioctl)
 */
int
get_info_vm(struct privsep *ps, struct imsg *imsg, int terminate)
{
	int ret;
	size_t ct, i;
	struct vm_info_params vip;
	struct vm_info_result *info;
	struct vm_terminate_params vtp;
	struct vmop_info_result vir;

	/*
	 * We issue the VMM_IOC_INFO ioctl twice, once with an input
	 * buffer size of 0, which results in vmm(4) returning the
	 * number of bytes required back to us in vip.vip_size,
	 * and then we call it again after malloc'ing the required
	 * number of bytes.
	 *
	 * It is possible that we could fail a second time (eg, if
	 * another VM was created in the instant between the two
	 * ioctls), but in that case the caller can just try again
	 * as vmm(4) will return a zero-sized list.
	 */
	vip.vip_size = 0;
	info = NULL;
	ret = 0;
	memset(&vir, 0, sizeof(vir));

	/* First ioctl to see how many bytes needed (vip.vip_size) */
	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) < 0)
		return (errno);

	if (vip.vip_info_ct != 0)
		return (EIO);

	info = malloc(vip.vip_size);
	if (info == NULL)
		return (ENOMEM);

	/* Second ioctl to get the actual list */
	vip.vip_info = info;
	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) < 0) {
		ret = errno;
		free(info);
		return (ret);
	}

	/* Return info */
	ct = vip.vip_size / sizeof(struct vm_info_result);
	for (i = 0; i < ct; i++) {
		if (terminate) {
			vtp.vtp_vm_id = info[i].vir_id;
			if ((ret = terminate_vm(&vtp)) != 0)
				return (ret);
			log_debug("%s: terminated VM %s (id %d)", __func__,
			    info[i].vir_name, info[i].vir_id);
			continue;
		}
		memcpy(&vir.vir_info, &info[i], sizeof(vir.vir_info));
		if (proc_compose_imsg(ps, PROC_PARENT, -1,
		    IMSG_VMDOP_GET_INFO_VM_DATA, imsg->hdr.peerid, -1,
		    &vir, sizeof(vir)) == -1)
			return (EIO);
	}
	free(info);
	return (0);
}

/*
 * start_client_vmd
 *
 * forks a copy of the parent vmd, chroots to VMD_USER's home, drops
 * privileges (changes to user VMD_USER), and returns.
 * Should the fork operation succeed, but later chroot/privsep
 * fail, the child exits.
 *
 * Return values (on success, returns in both the parent and the child):
 *  -1 : failure
 *  0: successful return into the child vmd
 *  !0 : successful return into the parent vmd, with the value being the
 *      child's pid
 */
int
start_client_vmd(void)
{
	int child_pid;

	child_pid = fork();
	if (child_pid < 0)
		return (-1);

	if (!child_pid) {
		/* child, already running without privileges */
		return (0);
	}

	/* Parent */
	return (child_pid);
}
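
/*
 * Caller pattern sketch for start_client_vmd() (this is how start_vm()
 * above consumes the fork-style return value):
 *
 *	ret = start_client_vmd();
 *	if (ret == -1)
 *		... failure, no child was created ...
 *	else if (ret > 0)
 *		... parent: ret is the child's pid ...
 *	else
 *		... child: set up and run the VM ...
 */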

/*
 * create_memory_map
 *
 * Sets up the guest physical memory ranges that the VM can access.
 *
 * Return values:
 *  nothing
 */
void
create_memory_map(struct vm_create_params *vcp)
{
	size_t len, mem_bytes, mem_mb;

	mem_mb = vcp->vcp_memranges[0].vmr_size;
	vcp->vcp_nmemranges = 0;
	if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
		return;

	mem_bytes = mem_mb * 1024 * 1024;

	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
	len = LOWMEM_KB * 1024;
	vcp->vcp_memranges[0].vmr_gpa = 0x0;
	vcp->vcp_memranges[0].vmr_size = len;
	mem_bytes -= len;

	/*
	 * Second memory region: LOWMEM_KB - 1MB.
	 *
	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
	 * We have to add this region, because some systems
	 * unconditionally write to 0xb8000 (VGA RAM), and
	 * we need to make sure that vmm(4) permits accesses
	 * to it. So allocate guest memory for it.
	 */
	len = 0x100000 - LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_size = len;
	mem_bytes -= len;

	/* Make sure that we do not place physical memory into MMIO ranges. */
	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
		len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
	else
		len = mem_bytes;

	/* Third memory region: 1MB - (1MB + len) */
	vcp->vcp_memranges[2].vmr_gpa = 0x100000;
	vcp->vcp_memranges[2].vmr_size = len;
	mem_bytes -= len;

	if (mem_bytes > 0) {
		/* Fourth memory region for the remaining memory (if any) */
		vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
		vcp->vcp_memranges[3].vmr_size = mem_bytes;
		vcp->vcp_nmemranges = 4;
	} else
		vcp->vcp_nmemranges = 3;
}
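
/*
 * Worked example for create_memory_map() (a sketch; LOWMEM_KB is taken
 * here as the size of the DOS low-memory area in KB): for a guest small
 * enough to fit below VMM_PCI_MMIO_BAR_BASE, the resulting map is
 *
 *	range 0: gpa 0x0                len LOWMEM_KB * 1024
 *	range 1: gpa LOWMEM_KB * 1024   len 1 MB - LOWMEM_KB * 1024
 *	range 2: gpa 0x100000           len remaining guest memory
 *
 * with vcp_nmemranges == 3. Only when the remainder would overlap the
 * PCI MMIO hole is a fourth range placed at VMM_PCI_MMIO_BAR_END + 1.
 */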

/*
 * alloc_guest_mem
 *
 * Allocates memory for the guest.
 * Instead of doing a single allocation with one mmap(), we allocate memory
 * separately for every range for the following reasons:
 * - ASLR for the individual ranges
 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
 *   map the single mmap'd userspace memory to the individual guest physical
 *   memory ranges, the underlying amap of the single mmap'd range would have
 *   to allocate per-page reference counters. The reason is that the
 *   individual guest physical ranges would reference the single mmap'd region
 *   only partially. However, if every guest physical range has its own
 *   corresponding mmap'd userspace allocation, there are no partial
 *   references: every guest physical range fully references an mmap'd
 *   range => no per-page reference counters have to be allocated.
 *
 * Return values:
 *  0: success
 *  !0: failure - errno indicating the source of the failure
 */
int
alloc_guest_mem(struct vm_create_params *vcp)
{
	void *p;
	int ret;
	size_t i, j;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANON, -1, 0);
		if (p == MAP_FAILED) {
			ret = errno;
			for (j = 0; j < i; j++) {
				vmr = &vcp->vcp_memranges[j];
				munmap((void *)vmr->vmr_va, vmr->vmr_size);
			}

			return (ret);
		}

		vmr->vmr_va = (vaddr_t)p;
	}

	return (0);
}

/*
 * vmm_create_vm
 *
 * Requests vmm(4) to create a new VM using the supplied creation
 * parameters. This operation results in the creation of the in-kernel
 * structures for the VM, but does not start the VM's vcpu(s).
 *
 * Parameters:
 *  vcp: vm_create_params struct containing the VM's desired creation
 *      configuration
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed
 */
int
vmm_create_vm(struct vm_create_params *vcp)
{
	/* Sanity check arguments */
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) < 0)
		return (errno);

	return (0);
}

/*
 * init_emulated_hw
 *
 * Initializes the userspace hardware emulation
 */
void
init_emulated_hw(struct vm_create_params *vcp, int *child_disks,
    int *child_taps)
{
	int i;

	/* Reset the IO port map */
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_init(vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;

	/* Init mc146818 RTC */
	mc146818_init(vcp->vcp_id);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init master and slave PICs */
	i8259_init();
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;

	/* Init ns8250 UART */
	ns8250_init(con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	pci_init();

	/* Initialize virtio devices */
	virtio_init(vcp, child_disks, child_taps);
}
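
/*
 * The table set up above drives I/O exit dispatch: when the guest
 * touches a registered port, vcpu_exit_inout() below indexes
 * ioports_map by the faulting port (vei_port) and calls the handler,
 * e.g. an access to TIMER_CTRL lands in vcpu_exit_i8253().
 */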

/*
 * run_vm
 *
 * Runs the VM whose creation parameters are specified in vcp
 *
 * Parameters:
 *  child_disks: previously-opened child VM disk file descriptors
 *  child_taps: previously-opened child tap file descriptors
 *  vcp: vm_create_params struct containing the VM's desired creation
 *      configuration
 *  vrs: VCPU register state to initialize
 *
 * Return values:
 *  0: the VM exited normally
 *  !0 : the VM exited abnormally or failed to start
 */
int
run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp,
    struct vcpu_reg_state *vrs)
{
	uint8_t evdone = 0;
	size_t i;
	int ret;
	pthread_t *tid, evtid;
	struct vm_run_params **vrp;
	void *exit_status;

	if (vcp == NULL)
		return (EINVAL);

	if (child_disks == NULL && vcp->vcp_ndisks != 0)
		return (EINVAL);

	if (child_taps == NULL && vcp->vcp_nnics != 0)
		return (EINVAL);

	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	event_init();

	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
	if (tid == NULL || vrp == NULL) {
		log_warn("%s: memory allocation error - exiting.",
		    __progname);
		return (ENOMEM);
	}

	log_debug("%s: initializing hardware for vm %s", __func__,
	    vcp->vcp_name);

	init_emulated_hw(vcp, child_disks, child_taps);

	ret = pthread_mutex_init(&threadmutex, NULL);
	if (ret) {
		errno = ret;
		log_warn("%s: could not initialize thread state mutex",
		    __func__);
		return (ret);
	}
	ret = pthread_cond_init(&threadcond, NULL);
	if (ret) {
		errno = ret;
		log_warn("%s: could not initialize thread state "
		    "condition variable", __func__);
		return (ret);
	}

	mutex_lock(&threadmutex);

	log_debug("%s: starting vcpu threads for vm %s", __func__,
	    vcp->vcp_name);

	/*
	 * Create and launch one thread for each VCPU. These threads may
	 * migrate between PCPUs over time; the need to reload CPU state
	 * in such situations is detected and performed by vmm(4) in the
	 * kernel.
	 */
	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
		vrp[i] = malloc(sizeof(struct vm_run_params));
		if (vrp[i] == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip free'ing */
			return (ENOMEM);
		}
		vrp[i]->vrp_exit = malloc(sizeof(union vm_exit));
		if (vrp[i]->vrp_exit == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip free'ing */
			return (ENOMEM);
		}
		vrp[i]->vrp_vm_id = vcp->vcp_id;
		vrp[i]->vrp_vcpu_id = i;

		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
			log_warnx("%s: cannot reset VCPU %zu - exiting.",
			    __progname, i);
			return (EIO);
		}

		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize cond var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		vcpu_hlt[i] = 0;

		/* Start each VCPU run thread at vcpu_run_loop */
		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
		if (ret) {
			/* caller will _exit after this return */
			errno = ret;
			log_warn("%s: could not create vcpu thread %zu",
			    __func__, i);
			return (ret);
		}
	}

	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
	if (ret) {
		errno = ret;
		log_warn("%s: could not create event thread", __func__);
		return (ret);
	}

	for (;;) {
		ret = pthread_cond_wait(&threadcond, &threadmutex);
		if (ret) {
			log_warn("%s: waiting on thread state condition "
			    "variable failed", __func__);
			return (ret);
		}

		/*
		 * Did a VCPU thread exit with an error? => return the first
		 * one
		 */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				continue;

			if (pthread_join(tid[i], &exit_status)) {
				log_warn("%s: failed to join thread %zd - "
				    "exiting", __progname, i);
				return (EIO);
			}

			if (exit_status != NULL) {
				log_warnx("%s: vm %d vcpu run thread %zd "
				    "exited abnormally", __progname,
				    vcp->vcp_id, i);
				return (EIO);
			}
		}

		/* Did the event thread exit? => return with an error */
		if (evdone) {
			if (pthread_join(evtid, &exit_status)) {
				log_warn("%s: failed to join event thread - "
				    "exiting", __progname);
				return (EIO);
			}

			log_warnx("%s: vm %d event thread exited "
			    "unexpectedly", __progname, vcp->vcp_id);
			return (EIO);
		}

		/* Did all VCPU threads exit successfully? => return 0 */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				break;
		}
		if (i == vcp->vcp_ncpus)
			return (0);

		/* Some more threads to wait for, start over */
	}

	return (0);
}
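
/*
 * Synchronization recap for run_vm() above: every VCPU thread and the
 * event thread announce completion by setting vcpu_done[]/evdone under
 * threadmutex and signalling threadcond. run_vm() wakes up in
 * pthread_cond_wait(), joins whatever finished, and returns 0 only
 * once every VCPU thread has exited cleanly.
 */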

void *
event_thread(void *arg)
{
	uint8_t *donep = arg;
	intptr_t ret;

	ret = event_dispatch();

	mutex_lock(&threadmutex);
	*donep = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return (void *)ret;
}

/*
 * vcpu_run_loop
 *
 * Runs a single VCPU until vmm(4) requires help handling an exit,
 * or the VM terminates.
 *
 * Parameters:
 *  arg: vcpu_run_params for the VCPU being run by this thread
 *
 * Return values:
 *  NULL: the VCPU shutdown properly
 *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
 */
void *
vcpu_run_loop(void *arg)
{
	struct vm_run_params *vrp = (struct vm_run_params *)arg;
	intptr_t ret = 0;
	int irq;
	uint32_t n;

	vrp->vrp_continue = 0;
	n = vrp->vrp_vcpu_id;

	for (;;) {
		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't lock vcpu run mtx (%d)",
			    __func__, (int)ret);
			return ((void *)ret);
		}

		/* If we are halted, wait */
		if (vcpu_hlt[n]) {
			ret = pthread_cond_wait(&vcpu_run_cond[n],
			    &vcpu_run_mtx[n]);

			if (ret) {
				log_warnx("%s: can't wait on cond (%d)",
				    __func__, (int)ret);
				(void)pthread_mutex_unlock(&vcpu_run_mtx[n]);
				break;
			}
		}

		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
		if (ret) {
			log_warnx("%s: can't unlock mutex on cond (%d)",
			    __func__, (int)ret);
			break;
		}

		if (vrp->vrp_irqready && i8259_is_pending()) {
			irq = i8259_ack();
			vrp->vrp_irq = irq;
		} else
			vrp->vrp_irq = 0xFFFF;

		/*
		 * Still more pending? Keep the INTR line asserted.
		 * XXX can probably avoid these ioctls by providing
		 * intr in vrp.
		 */
		if (i8259_is_pending()) {
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 1))
				fatal("can't set INTR");
		} else {
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 0))
				fatal("can't clear INTR");
		}

		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) < 0) {
			/* If run ioctl failed, exit */
			ret = errno;
			log_warn("%s: vm %d / vcpu %d run ioctl failed",
			    __func__, vrp->vrp_vm_id, n);
			break;
		}

		/* If the VM is terminating, exit normally */
		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
			ret = (intptr_t)NULL;
			break;
		}

		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
			/*
			 * vmm(4) needs help handling an exit, handle in
			 * vcpu_exit.
			 */
			if (vcpu_exit(vrp)) {
				ret = EIO;
				break;
			}
		}
	}

	mutex_lock(&threadmutex);
	vcpu_done[n] = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return ((void *)ret);
}

int
vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
{
	struct vm_intr_params vip;

	memset(&vip, 0, sizeof(vip));

	vip.vip_vm_id = vm_id;
	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
	vip.vip_intr = intr;

	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) < 0)
		return (errno);

	return (0);
}

/*
 * vcpu_exit_pci
 *
 * Handle all I/O to the emulated PCI subsystem.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return value:
 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
 *      be injected.
 */
uint8_t
vcpu_exit_pci(struct vm_run_params *vrp)
{
	union vm_exit *vei = vrp->vrp_exit;
	uint8_t intr;

	intr = 0xFF;

	switch (vei->vei.vei_port) {
	case PCI_MODE1_ADDRESS_REG:
		pci_handle_address_reg(vrp);
		break;
	case PCI_MODE1_DATA_REG:
		pci_handle_data_reg(vrp);
		break;
	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
		intr = pci_handle_io(vrp);
		break;
	default:
		log_warnx("%s: unknown PCI register 0x%llx",
		    __progname, (uint64_t)vei->vei.vei_port);
		break;
	}

	return (intr);
}

/*
 * vcpu_exit_inout
 *
 * Handle all I/O exits that need to be emulated in vmd. This includes the
 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 */
void
vcpu_exit_inout(struct vm_run_params *vrp)
{
	union vm_exit *vei = vrp->vrp_exit;
	uint8_t intr = 0xFF;

	if (ioports_map[vei->vei.vei_port] != NULL)
		intr = ioports_map[vei->vei.vei_port](vrp);
	else if (vei->vei.vei_dir == VEI_DIR_IN)
		vei->vei.vei_data = 0xFFFFFFFF;

	if (intr != 0xFF)
		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
}

/*
 * vcpu_exit
 *
 * Handle a vcpu exit. This function is called when it is determined that
 * vmm(4) requires the assistance of vmd to support a particular guest
 * exit type (eg, accessing an I/O port or device). Guest state is contained
 * in 'vrp', and will be resent to vmm(4) on exit completion.
 *
 * Upon conclusion of handling the exit, the function determines if any
 * interrupts should be injected into the guest, and asserts the proper
 * IRQ line whose interrupt should be vectored.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: the exit was handled successfully
 *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
 */
int
vcpu_exit(struct vm_run_params *vrp)
{
	int ret;

	switch (vrp->vrp_exit_reason) {
	case VMX_EXIT_IO:
		vcpu_exit_inout(vrp);
		break;
	case VMX_EXIT_HLT:
		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
		if (ret) {
			log_warnx("%s: can't lock vcpu mutex (%d)",
			    __func__, ret);
			return (1);
		}
		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
		if (ret) {
			log_warnx("%s: can't unlock vcpu mutex (%d)",
			    __func__, ret);
			return (1);
		}
		break;
	case VMX_EXIT_INT_WINDOW:
		break;
	case VMX_EXIT_TRIPLE_FAULT:
		log_warnx("%s: triple fault", __progname);
		return (1);
	default:
		log_debug("%s: unknown exit reason %d",
		    __progname, vrp->vrp_exit_reason);
	}

	/* XXX this may not be irq 9 all the time */
	if (vionet_process_rx())
		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 9);

	vrp->vrp_continue = 1;

	return (0);
}

/*
 * find_gpa_range
 *
 * Search for a contiguous guest physical mem range.
 *
 * Parameters:
 *  vcp: VM create parameters that contain the memory map to search in
 *  gpa: the starting guest physical address
 *  len: the length of the memory range
 *
 * Return values:
 *  NULL: on failure if there is no memory range as described by the
 *      parameters
 *  Pointer to vm_mem_range that contains the start of the range otherwise.
 */
static struct vm_mem_range *
find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
{
	size_t i, n;
	struct vm_mem_range *vmr;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
			break;
	}

	/* No range found. */
	if (i == vcp->vcp_nmemranges)
		return (NULL);

	/*
	 * vmr may cover the range [gpa, gpa + len) only partly. Make
	 * sure that the following vm_mem_ranges are contiguous and
	 * cover the rest.
	 */
	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
	if (len < n)
		len = 0;
	else
		len -= n;
	gpa = vmr->vmr_gpa + vmr->vmr_size;
	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa != vmr->vmr_gpa)
			return (NULL);
		if (len <= vmr->vmr_size)
			len = 0;
		else
			len -= vmr->vmr_size;

		gpa = vmr->vmr_gpa + vmr->vmr_size;
	}

	if (len != 0)
		return (NULL);

	return (vmr);
}
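
/*
 * Example for find_gpa_range() (a sketch with made-up numbers): with
 * ranges [0x0, 0x9f000) and [0x9f000, 0x100000), a request for
 * gpa = 0x9e000, len = 0x2000 starts in the first range, consumes the
 * 0x1000 bytes left in it, and is satisfied because the second range
 * begins exactly at 0x9f000. A gap between the two ranges would make
 * the function return NULL instead.
 */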

/*
 * write_mem
 *
 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
 *
 * Parameters:
 *  dst: the destination paddr_t in the guest VM
 *  buf: data to copy
 *  len: number of bytes to copy
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [dst, dst + len) does not
 *      exist in the guest.
 */
int
write_mem(paddr_t dst, void *buf, size_t len)
{
	char *from = buf, *to;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params, dst, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
		    "len = 0x%zx", __func__, dst, len);
		return (EINVAL);
	}

	off = dst - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		to = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		from += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * read_mem
 *
 * Reads memory at guest paddr 'src' into 'buf'.
 *
 * Parameters:
 *  src: the source paddr_t in the guest VM to read from.
 *  buf: destination (local) buffer
 *  len: number of bytes to read
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [src, src + len) does not
 *      exist in the guest.
 */
int
read_mem(paddr_t src, void *buf, size_t len)
{
	char *from, *to = buf;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range src = 0x%lx, "
		    "len = 0x%zx", __func__, src, len);
		return (EINVAL);
	}

	off = src - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		from = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		to += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * vcpu_assert_pic_irq
 *
 * Injects the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to inject to
 *  vcpu_id: VCPU ID to inject to
 *  irq: IRQ to inject
 */
void
vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	int ret;

	i8259_assert_irq(irq);

	if (i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
			fatalx("%s: can't assert INTR", __func__);

		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);

		vcpu_hlt[vcpu_id] = 0;
		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
		if (ret)
			fatalx("%s: can't signal (%d)", __func__, ret);
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
	}
}

/*
 * fd_hasdata
 *
 * Determines if data can be read from a file descriptor.
 *
 * Parameters:
 *  fd: the fd to check
 *
 * Return values:
 *  1 if data can be read from an fd, or 0 otherwise.
 */
int
fd_hasdata(int fd)
{
	struct pollfd pfd[1];
	int nready, hasdata = 0;

	pfd[0].fd = fd;
	pfd[0].events = POLLIN;
	nready = poll(pfd, 1, 0);
	if (nready == -1)
		log_warn("checking file descriptor for data failed");
	else if (nready == 1 && pfd[0].revents & POLLIN)
		hasdata = 1;
	return (hasdata);
}

/*
 * mutex_lock
 *
 * Wrapper function for pthread_mutex_lock that does error checking and that
 * exits on failure
 */
void
mutex_lock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_lock(m);
	if (ret) {
		errno = ret;
		fatal("could not acquire mutex");
	}
}

/*
 * mutex_unlock
 *
 * Wrapper function for pthread_mutex_unlock that does error checking and that
 * exits on failure
 */
void
mutex_unlock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_unlock(m);
	if (ret) {
		errno = ret;
		fatal("could not release mutex");
	}
}