/*	$OpenBSD: vm.c,v 1.45 2019/03/01 07:32:29 mlarkin Exp $	*/

/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/wait.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/mman.h>

#include <dev/ic/i8253reg.h>
#include <dev/isa/isareg.h>
#include <dev/pci/pcireg.h>

#include <machine/param.h>
#include <machine/psl.h>
#include <machine/pte.h>
#include <machine/specialreg.h>
#include <machine/vmmvar.h>

#include <net/if.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <imsg.h>
#include <limits.h>
#include <poll.h>
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <util.h>

#include "vmd.h"
#include "vmm.h"
#include "loadfile.h"
#include "pci.h"
#include "virtio.h"
#include "proc.h"
#include "i8253.h"
#include "i8259.h"
#include "ns8250.h"
#include "mc146818.h"
#include "fw_cfg.h"
#include "atomicio.h"

io_fn_t ioports_map[MAX_PORTS];

int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
    struct vmop_create_params *, struct vcpu_reg_state *);
void vm_dispatch_vmm(int, short, void *);
void *event_thread(void *);
void *vcpu_run_loop(void *);
int vcpu_exit(struct vm_run_params *);
int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
void create_memory_map(struct vm_create_params *);
int alloc_guest_mem(struct vm_create_params *);
int vmm_create_vm(struct vm_create_params *);
void init_emulated_hw(struct vmop_create_params *, int,
    int[][VM_MAX_BASE_PER_DISK], int *);
void restore_emulated_hw(struct vm_create_params *, int, int *,
    int[][VM_MAX_BASE_PER_DISK], int);
void vcpu_exit_inout(struct vm_run_params *);
uint8_t vcpu_exit_pci(struct vm_run_params *);
int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
int loadfile_bios(FILE *, struct vcpu_reg_state *);
int send_vm(int, struct vm_create_params *);
int dump_send_header(int);
int dump_vmr(int, struct vm_mem_range *);
int dump_mem(int, struct vm_create_params *);
void restore_vmr(int, struct vm_mem_range *);
void restore_mem(int, struct vm_create_params *);
void pause_vm(struct vm_create_params *);
void unpause_vm(struct vm_create_params *);

static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
    size_t);

int con_fd;
struct vmd_vm *current_vm;

extern struct vmd *env;

extern char *__progname;

pthread_mutex_t threadmutex;
pthread_cond_t threadcond;

pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];

/*
 * Represents a standard register set for an OS to be booted
 * as a flat 64 bit address space.
 *
 * NOT set here are:
 *  RIP
 *  RSP
 *  GDTR BASE
 *
 * Specific bootloaders should clone this structure and override
 * those fields as needed.
 *
 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
 *        features of the CPU in use.
 */
static const struct vcpu_reg_state vcpu_init_flat64 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = CR0_CD | CR0_NW | CR0_ET | CR0_PE | CR0_PG,
	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
};

/*
 * Represents a standard register set for a BIOS to be booted
 * as a flat 16 bit address space.
 */
static const struct vcpu_reg_state vcpu_init_flat16 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
	.vrs_crs[VCPU_REGS_CR3] = 0,
	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
};

/*
 * loadfile_bios
 *
 * As an alternative to loadfile_elf, this function loads a non-ELF BIOS
 * image directly into memory.
 *
 * Parameters:
 *  fp: FILE pointer to the BIOS image to load
 *  (out) vrs: register state to set on init for this kernel
 *
 * Return values:
 *  0 if successful
 *  various error codes returned from read(2) or loadelf functions
 */
int
loadfile_bios(FILE *fp, struct vcpu_reg_state *vrs)
{
	off_t	 size, off;

	/* Set up a "flat 16 bit" register state for BIOS */
	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));

	/* Get the size of the BIOS image and seek to the beginning */
	if (fseeko(fp, 0, SEEK_END) == -1 || (size = ftello(fp)) == -1 ||
	    fseeko(fp, 0, SEEK_SET) == -1)
		return (-1);

	/* The BIOS image must end at 1M */
	if ((off = 1048576 - size) < 0)
		return (-1);

	/* Read BIOS image into memory */
	if (mread(fp, off, size) != (size_t)size) {
		errno = EIO;
		return (-1);
	}

	log_debug("%s: loaded BIOS image", __func__);

	return (0);
}

/*
 * start_vm
 *
 * After forking a new VM process, starts the new VM with the creation
 * parameters supplied (in the incoming vm->vm_params field). This
 * function performs a basic sanity check on the incoming parameters
 * and then performs the following steps to complete the creation of the VM:
 *
 * 1. validates and creates the new VM
 * 2. opens the imsg control channel to the parent and drops more privilege
 * 3. drops additional privileges by calling pledge(2)
 * 4. loads the kernel from the disk image or file descriptor
 * 5. runs the VM's VCPU loops.
 *
 * Parameters:
 *  vm: The VM data structure, including the VM create parameters.
 *  fd: The imsg socket that is connected to the parent process.
 *
 * Return values:
 *  0: success
 *  !0: failure - typically an errno indicating the source of the failure
 */
int
start_vm(struct vmd_vm *vm, int fd)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params	*vcp = &vmc->vmc_params;
	struct vcpu_reg_state	 vrs;
	int			 nicfds[VMM_MAX_NICS_PER_VM];
	int			 ret;
	FILE			*fp;
	struct vmboot_params	 vmboot;
	size_t			 i;
	struct vm_rwregs_params  vrp;

	/* Child */
	setproctitle("%s", vcp->vcp_name);
	log_procinit(vcp->vcp_name);

	if (!vm->vm_received)
		create_memory_map(vcp);

	ret = alloc_guest_mem(vcp);

	if (ret) {
		errno = ret;
		fatal("could not allocate guest memory - exiting");
	}

	ret = vmm_create_vm(vcp);
	current_vm = vm;

	/* send back the kernel-generated vm id (0 on error) */
	if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
	    sizeof(vcp->vcp_id))
		fatal("write vcp id");

	if (ret) {
		errno = ret;
		fatal("create vmm ioctl failed - exiting");
	}

	/*
	 * pledge in the vm processes:
	 * stdio - for malloc and basic I/O including events.
	 * recvfd - for send/recv.
	 * vmm - for the vmm ioctls and operations.
	 */
	if (pledge("stdio vmm recvfd", NULL) == -1)
		fatal("pledge");

	if (vm->vm_received) {
		ret = read(vm->vm_receive_fd, &vrp, sizeof(vrp));
		if (ret != sizeof(vrp)) {
			fatal("received incomplete vrp - exiting");
		}
		vrs = vrp.vrwp_regs;
	} else {
		/*
		 * Set up default "flat 64 bit" register state - RIP,
		 * RSP, and GDT info will be set in bootloader
		 */
		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));

		/* Find and open kernel image */
		if ((fp = vmboot_open(vm->vm_kernel,
		    vm->vm_disks[0], vmc->vmc_diskbases[0],
		    vmc->vmc_disktypes[0], &vmboot)) == NULL)
			fatalx("failed to open kernel - exiting");

		/* Load kernel image */
		ret = loadfile_elf(fp, vcp, &vrs,
		    vmboot.vbp_bootdev, vmboot.vbp_howto, vmc->vmc_bootdevice);

		/*
		 * Try BIOS as a fallback (only if it was provided as an image
		 * with vm->vm_kernel and not loaded from the disk)
		 */
		if (ret && errno == ENOEXEC && vm->vm_kernel != -1)
			ret = loadfile_bios(fp, &vrs);

		if (ret)
			fatal("failed to load kernel or BIOS - exiting");

		vmboot_close(fp, &vmboot);
	}

	if (vm->vm_kernel != -1)
		close(vm->vm_kernel);

	con_fd = vm->vm_tty;
	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
		fatal("failed to set nonblocking mode on console");

	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
		nicfds[i] = vm->vm_ifs[i].vif_fd;

	event_init();

	if (vm->vm_received) {
		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
		    vm->vm_disks, vm->vm_cdrom);
		mc146818_start();
		restore_mem(vm->vm_receive_fd, vcp);
	}

	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
		fatal("setup vm pipe");

	/* Execute the vcpu run loop(s) for this VM */
	ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);

	/* Ensure that any in-flight data is written back */
	virtio_shutdown(vm);

	return (ret);
}

/*
 * vm_dispatch_vmm
 *
 * imsg callback for messages that are received from the vmm parent process.
 */
void
vm_dispatch_vmm(int fd, short event, void *arg)
{
	struct vmd_vm		*vm = arg;
	struct vmop_result	 vmr;
	struct imsgev		*iev = &vm->vm_iev;
	struct imsgbuf		*ibuf = &iev->ibuf;
	struct imsg		 imsg;
	ssize_t			 n;
	int			 verbose;

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0)
			_exit(0);
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
		if (n == 0)
			_exit(0);
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

#if DEBUG > 1
		log_debug("%s: got imsg %d from %s",
		    __func__, imsg.hdr.type,
		    vm->vm_params.vmc_params.vcp_name);
#endif

		switch (imsg.hdr.type) {
		case IMSG_CTL_VERBOSE:
			IMSG_SIZE_CHECK(&imsg, &verbose);
			memcpy(&verbose, imsg.data, sizeof(verbose));
			log_setverbose(verbose);
			break;
		case IMSG_VMDOP_VM_SHUTDOWN:
			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_VM_REBOOT:
			if (vmmci_ctl(VMMCI_REBOOT) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_PAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			pause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_UNPAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			unpause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_SEND_VM_REQUEST:
			vmr.vmr_id = vm->vm_vmid;
			vmr.vmr_result = send_vm(imsg.fd,
			    &vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_SEND_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		default:
			fatalx("%s: got invalid imsg %d from %s",
			    __func__, imsg.hdr.type,
			    vm->vm_params.vmc_params.vcp_name);
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}

/*
 * vm_shutdown
 *
 * Tell the vmm parent process to shutdown or reboot the VM and exit.
 */
__dead void
vm_shutdown(unsigned int cmd)
{
	switch (cmd) {
	case VMMCI_NONE:
	case VMMCI_SHUTDOWN:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
		break;
	case VMMCI_REBOOT:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
		break;
	default:
		fatalx("invalid vm ctl command: %d", cmd);
	}
	imsg_flush(&current_vm->vm_iev.ibuf);

	_exit(0);
}

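/*
 * send_vm
 *
 * Dumps the state of the current VM to the supplied file descriptor: the
 * dump header, the create parameters, each VCPU's register state, the
 * emulated device state and finally the guest memory. The VM is paused
 * before dumping; on success it is terminated, on failure it is unpaused.
 *
 * Parameters:
 *  fd: file descriptor to dump the VM state to
 *  vcp: VM create parameters of the VM being sent
 *
 * Return values:
 *  0: success
 *  !0: failure
 */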
int
send_vm(int fd, struct vm_create_params *vcp)
{
	struct vm_rwregs_params	   vrp;
	struct vmop_create_params *vmc;
	struct vm_terminate_params vtp;
	unsigned int		   flags = 0;
	unsigned int		   i;
	int			   ret = 0;
	size_t			   sz;

	if (dump_send_header(fd)) {
		log_info("%s: failed to send vm dump header", __func__);
		goto err;
	}

	pause_vm(vcp);

	vmc = calloc(1, sizeof(struct vmop_create_params));
	if (vmc == NULL) {
		log_warn("%s: calloc error getting vmc", __func__);
		ret = -1;
		goto err;
	}

	flags |= VMOP_CREATE_MEMORY;
	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
	    vmop_create_params));
	vmc->vmc_flags = flags;
	vrp.vrwp_vm_id = vcp->vcp_id;
	vrp.vrwp_mask = VM_RWREGS_ALL;

	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
	if (sz != sizeof(struct vmop_create_params)) {
		ret = -1;
		goto err;
	}

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp.vrwp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
			log_warn("%s: readregs failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vrp,
		    sizeof(struct vm_rwregs_params));
		if (sz != sizeof(struct vm_rwregs_params)) {
			log_warn("%s: dumping registers failed", __func__);
			ret = -1;
			goto err;
		}
	}

	if ((ret = i8253_dump(fd)))
		goto err;
	if ((ret = i8259_dump(fd)))
		goto err;
	if ((ret = ns8250_dump(fd)))
		goto err;
	if ((ret = mc146818_dump(fd)))
		goto err;
	if ((ret = fw_cfg_dump(fd)))
		goto err;
	if ((ret = pci_dump(fd)))
		goto err;
	if ((ret = virtio_dump(fd)))
		goto err;
	if ((ret = dump_mem(fd, vcp)))
		goto err;

	vtp.vtp_vm_id = vcp->vcp_id;
	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) < 0) {
		log_warnx("%s: term IOC error: %d, %d", __func__,
		    errno, ENOENT);
	}
err:
	close(fd);
	if (ret)
		unpause_vm(vcp);
	return ret;
}

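/*
 * dump_send_header
 *
 * Writes the vm_dump_header to fd, recording the host's CPUID leaves
 * (0x00, 0x01, 0x07, 0x0d and 0x80000001) and the dump format version so
 * the receiving side can check compatibility.
 *
 * Return values:
 *  0: success
 *  -1: failure
 */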
int
dump_send_header(int fd)
{
	struct vm_dump_header	   vmh;
	int			   i;

	vmh.vmh_cpuids[0].code = 0x00;
	vmh.vmh_cpuids[0].leaf = 0x00;

	vmh.vmh_cpuids[1].code = 0x01;
	vmh.vmh_cpuids[1].leaf = 0x00;

	vmh.vmh_cpuids[2].code = 0x07;
	vmh.vmh_cpuids[2].leaf = 0x00;

	vmh.vmh_cpuids[3].code = 0x0d;
	vmh.vmh_cpuids[3].leaf = 0x00;

	vmh.vmh_cpuids[4].code = 0x80000001;
	vmh.vmh_cpuids[4].leaf = 0x00;

	vmh.vmh_version = VM_DUMP_VERSION;

	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
		CPUID_LEAF(vmh.vmh_cpuids[i].code,
		    vmh.vmh_cpuids[i].leaf,
		    vmh.vmh_cpuids[i].a,
		    vmh.vmh_cpuids[i].b,
		    vmh.vmh_cpuids[i].c,
		    vmh.vmh_cpuids[i].d);
	}

	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
		return (-1);

	return (0);
}

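/*
 * dump_mem
 *
 * Dumps all of the VM's guest memory ranges to fd.
 *
 * Return values:
 *  0: success
 *  !0: failure
 */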
int
dump_mem(int fd, struct vm_create_params *vcp)
{
	unsigned int	i;
	int		ret;
	struct vm_mem_range	*vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		ret = dump_vmr(fd, vmr);
		if (ret)
			return ret;
	}
	return (0);
}

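/*
 * restore_mem
 *
 * Restores all of the VM's guest memory ranges from fd.
 */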
void
restore_mem(int fd, struct vm_create_params *vcp)
{
	unsigned int	     i;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		restore_vmr(fd, vmr);
	}
}

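/*
 * dump_vmr
 *
 * Writes the contents of the guest memory range described by vmr to fd,
 * one page at a time.
 *
 * Return values:
 *  0: success
 *  -1: failure
 */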
int
dump_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t	rem = vmr->vmr_size, read = 0;
	char	buf[PAGE_SIZE];

	while (rem > 0) {
		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
			log_warn("failed to read vmr");
			return (-1);
		}
		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
			log_warn("failed to dump vmr");
			return (-1);
		}
		rem = rem - PAGE_SIZE;
		read = read + PAGE_SIZE;
	}
	return (0);
}

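/*
 * restore_vmr
 *
 * Reads the contents of the guest memory range described by vmr back from
 * fd, one page at a time, and fatals on failure.
 */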
void
restore_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t	rem = vmr->vmr_size, wrote = 0;
	char	buf[PAGE_SIZE];

	while (rem > 0) {
		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
			fatal("failed to restore vmr");
		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
			fatal("failed to write vmr");
		rem = rem - PAGE_SIZE;
		wrote = wrote + PAGE_SIZE;
	}
}

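/*
 * pause_vm
 *
 * Marks the VM as paused and stops the emulated hardware timers (i8253
 * PIT and mc146818 RTC).
 */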
void
pause_vm(struct vm_create_params *vcp)
{
	if (current_vm->vm_paused)
		return;

	current_vm->vm_paused = 1;

	/*
	 * XXX: vcpu_run_loop is running in another thread and we have
	 * to wait for the vm to exit before returning.
	 */
	sleep(1);

	i8253_stop();
	mc146818_stop();
}

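/*
 * unpause_vm
 *
 * Restarts the emulated hardware timers and wakes up the VCPU run threads
 * waiting on vcpu_run_cond.
 */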
void
unpause_vm(struct vm_create_params *vcp)
{
	unsigned int n;

	if (!current_vm->vm_paused)
		return;

	current_vm->vm_paused = 0;

	i8253_start();
	mc146818_start();
	for (n = 0; n < vcp->vcp_ncpus; n++)
		pthread_cond_broadcast(&vcpu_run_cond[n]);
}

/*
 * vcpu_reset
 *
 * Requests vmm(4) to reset the VCPUs in the indicated VM to
 * the register state provided
 *
 * Parameters:
 *  vmid: VM ID to reset
 *  vcpu_id: VCPU ID to reset
 *  vrs: the register state to initialize
 *
 * Return values:
 *  0: success
 *  !0: ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
 *      valid)
 */
int
vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
{
	struct vm_resetcpu_params vrp;

	memset(&vrp, 0, sizeof(vrp));
	vrp.vrp_vm_id = vmid;
	vrp.vrp_vcpu_id = vcpu_id;
	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));

	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);

	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) < 0)
		return (errno);

	return (0);
}

/*
 * create_memory_map
 *
 * Sets up the guest physical memory ranges that the VM can access.
 *
 * Parameters:
 *  vcp: VM create parameters describing the VM whose memory map
 *       is being created
 *
 * Return values:
 *  nothing
 */
void
create_memory_map(struct vm_create_params *vcp)
{
	size_t len, mem_bytes, mem_mb;

	mem_mb = vcp->vcp_memranges[0].vmr_size;
	vcp->vcp_nmemranges = 0;
	if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
		return;

	mem_bytes = mem_mb * 1024 * 1024;

	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
	len = LOWMEM_KB * 1024;
	vcp->vcp_memranges[0].vmr_gpa = 0x0;
	vcp->vcp_memranges[0].vmr_size = len;
	mem_bytes -= len;

	/*
	 * Second memory region: LOWMEM_KB - 1MB.
	 *
	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
	 * We have to add this region, because some systems
	 * unconditionally write to 0xb8000 (VGA RAM), and
	 * we need to make sure that vmm(4) permits accesses
	 * to it. So allocate guest memory for it.
	 */
	len = 0x100000 - LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_size = len;
	mem_bytes -= len;

	/* Make sure that we do not place physical memory into MMIO ranges. */
	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
		len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
	else
		len = mem_bytes;

	/* Third memory region: 1MB - (1MB + len) */
	vcp->vcp_memranges[2].vmr_gpa = 0x100000;
	vcp->vcp_memranges[2].vmr_size = len;
	mem_bytes -= len;

	if (mem_bytes > 0) {
		/* Fourth memory region for the remaining memory (if any) */
		vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
		vcp->vcp_memranges[3].vmr_size = mem_bytes;
		vcp->vcp_nmemranges = 4;
	} else
		vcp->vcp_nmemranges = 3;
}

/*
 * alloc_guest_mem
 *
 * Allocates memory for the guest.
 * Instead of doing a single allocation with one mmap(), we allocate memory
 * separately for every range for the following reasons:
 * - ASLR for the individual ranges
 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
 *   map the single mmap'd userspace memory to the individual guest physical
 *   memory ranges, the underlying amap of the single mmap'd range would have
 *   to allocate per-page reference counters. The reason is that the
 *   individual guest physical ranges would reference the single mmap'd region
 *   only partially. However, if every guest physical range has its own
 *   corresponding mmap'd userspace allocation, there are no partial
 *   references: every guest physical range fully references an mmap'd
 *   range => no per-page reference counters have to be allocated.
 *
 * Return values:
 *  0: success
 *  !0: failure - errno indicating the source of the failure
 */
int
alloc_guest_mem(struct vm_create_params *vcp)
{
	void *p;
	int ret;
	size_t i, j;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANON, -1, 0);
		if (p == MAP_FAILED) {
			ret = errno;
			for (j = 0; j < i; j++) {
				vmr = &vcp->vcp_memranges[j];
				munmap((void *)vmr->vmr_va, vmr->vmr_size);
			}

			return (ret);
		}

		vmr->vmr_va = (vaddr_t)p;
	}

	return (0);
}

/*
 * vmm_create_vm
 *
 * Requests vmm(4) to create a new VM using the supplied creation
 * parameters. This operation results in the creation of the in-kernel
 * structures for the VM, but does not start the VM's vcpu(s).
 *
 * Parameters:
 *  vcp: vm_create_params struct containing the VM's desired creation
 *      configuration
 *
 * Return values:
 *  0: success
 *  !0: ioctl to vmm(4) failed
 */
int
vmm_create_vm(struct vm_create_params *vcp)
{
	/* Sanity check arguments */
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) < 0)
		return (errno);

	return (0);
}

/*
 * init_emulated_hw
 *
 * Initializes the userspace hardware emulation
 */
void
init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	int i;
	uint64_t memlo, memhi;

	/* Calculate memory size for NVRAM registers */
	memlo = memhi = 0;
	if (vcp->vcp_nmemranges > 2)
		memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000;

	if (vcp->vcp_nmemranges > 3)
		memhi = vcp->vcp_memranges[3].vmr_size;

	/* Reset the IO port map */
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_init(vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;

	/* Init mc146818 RTC */
	mc146818_init(vcp->vcp_id, memlo, memhi);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init master and slave PICs */
	i8259_init();
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
	ioports_map[ELCR0] = vcpu_exit_elcr;
	ioports_map[ELCR1] = vcpu_exit_elcr;

	/* Init ns8250 UART */
	ns8250_init(con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Init QEMU fw_cfg interface */
	fw_cfg_init(vmc);
	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_init();

	/* Initialize virtio devices */
	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
}

/*
 * restore_emulated_hw
 *
 * Restores the userspace hardware emulation from fd
 */
void
restore_emulated_hw(struct vm_create_params *vcp, int fd,
    int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
{
	int i;

	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_restore(fd, vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;

	/* Init master and slave PICs */
	i8259_restore(fd);
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;

	/* Init ns8250 UART */
	ns8250_restore(fd, con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Init mc146818 RTC */
	mc146818_restore(fd, vcp->vcp_id);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init QEMU fw_cfg interface */
	fw_cfg_restore(fd);
	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_restore(fd);
	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
}

/*
 * run_vm
 *
 * Runs the VM whose creation parameters are specified in vcp
 *
 * Parameters:
 *  child_cdrom: previously-opened child ISO disk file descriptor
 *  child_disks: previously-opened child VM disk file descriptors
 *  child_taps: previously-opened child tap file descriptors
 *  vmc: vmop_create_params struct containing the VM's desired creation
 *      configuration
 *  vrs: VCPU register state to initialize
 *
 * Return values:
 *  0: the VM exited normally
 *  !0: the VM exited abnormally or failed to start
 */
int
run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
    int *child_taps, struct vmop_create_params *vmc,
    struct vcpu_reg_state *vrs)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct vm_rwregs_params vregsp;
	uint8_t evdone = 0;
	size_t i;
	int ret;
	pthread_t *tid, evtid;
	struct vm_run_params **vrp;
	void *exit_status;

	if (vcp == NULL)
		return (EINVAL);

	if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
		return (EINVAL);

	if (child_disks == NULL && vcp->vcp_ndisks != 0)
		return (EINVAL);

	if (child_taps == NULL && vcp->vcp_nnics != 0)
		return (EINVAL);

	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
	if (tid == NULL || vrp == NULL) {
		log_warn("%s: memory allocation error - exiting.",
		    __progname);
		return (ENOMEM);
	}

	log_debug("%s: initializing hardware for vm %s", __func__,
	    vcp->vcp_name);

	if (!current_vm->vm_received)
		init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);

	ret = pthread_mutex_init(&threadmutex, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state mutex",
		    __func__);
		return (ret);
	}
	ret = pthread_cond_init(&threadcond, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state "
		    "condition variable", __func__);
		return (ret);
	}

	mutex_lock(&threadmutex);

	log_debug("%s: starting vcpu threads for vm %s", __func__,
	    vcp->vcp_name);

	/*
	 * Create and launch one thread for each VCPU. These threads may
	 * migrate between PCPUs over time; the need to reload CPU state
	 * in such situations is detected and performed by vmm(4) in the
	 * kernel.
	 */
	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp[i] = malloc(sizeof(struct vm_run_params));
		if (vrp[i] == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
		if (vrp[i]->vrp_exit == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_vm_id = vcp->vcp_id;
		vrp[i]->vrp_vcpu_id = i;

		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
			log_warnx("%s: cannot reset VCPU %zu - exiting.",
			    __progname, i);
			return (EIO);
		}

		/* once more because vcpu_reset changes regs */
		if (current_vm->vm_received) {
			vregsp.vrwp_vm_id = vcp->vcp_id;
			vregsp.vrwp_vcpu_id = i;
			vregsp.vrwp_regs = *vrs;
			vregsp.vrwp_mask = VM_RWREGS_ALL;
			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
			    &vregsp)) < 0) {
				log_warn("%s: writeregs failed", __func__);
				return (ret);
			}
		}

		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize cond var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		vcpu_hlt[i] = 0;

		/* Start each VCPU run thread at vcpu_run_loop */
		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
		if (ret) {
			/* caller will _exit after this return */
			errno = ret;
			log_warn("%s: could not create vcpu thread %zu",
			    __func__, i);
			return (ret);
		}
	}

	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
	if (ret) {
		errno = ret;
		log_warn("%s: could not create event thread", __func__);
		return (ret);
	}

	for (;;) {
		ret = pthread_cond_wait(&threadcond, &threadmutex);
		if (ret) {
			log_warn("%s: waiting on thread state condition "
			    "variable failed", __func__);
			return (ret);
		}

		/*
		 * Did a VCPU thread exit with an error? => return the first one
		 */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				continue;

			if (pthread_join(tid[i], &exit_status)) {
				log_warn("%s: failed to join thread %zu - "
				    "exiting", __progname, i);
				return (EIO);
			}

			ret = (intptr_t)exit_status;
		}

		/* Did the event thread exit? => return with an error */
		if (evdone) {
			if (pthread_join(evtid, &exit_status)) {
				log_warn("%s: failed to join event thread - "
				    "exiting", __progname);
				return (EIO);
			}

			log_warnx("%s: vm %d event thread exited "
			    "unexpectedly", __progname, vcp->vcp_id);
			return (EIO);
		}

		/* Did all VCPU threads exit successfully? => return */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				break;
		}
		if (i == vcp->vcp_ncpus)
			return (ret);

		/* Some more threads to wait for, start over */
	}

	return (ret);
}

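/*
 * event_thread
 *
 * Runs the libevent dispatch loop for the VM process. When the loop
 * returns, signals the main thread via threadcond that the event thread
 * is done.
 *
 * Parameters:
 *  arg: pointer to the "done" flag to set before signalling
 *
 * Return values:
 *  The (cast) return value of event_dispatch.
 */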
void *
event_thread(void *arg)
{
	uint8_t *donep = arg;
	intptr_t ret;

	ret = event_dispatch();

	mutex_lock(&threadmutex);
	*donep = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return (void *)ret;
}

/*
 * vcpu_run_loop
 *
 * Runs a single VCPU until vmm(4) requires help handling an exit,
 * or the VM terminates.
 *
 * Parameters:
 *  arg: vm_run_params for the VCPU being run by this thread
 *
 * Return values:
 *  NULL: the VCPU shutdown properly
 *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
 */
void *
vcpu_run_loop(void *arg)
{
	struct vm_run_params *vrp = (struct vm_run_params *)arg;
	intptr_t ret = 0;
	int irq;
	uint32_t n;

	vrp->vrp_continue = 0;
	n = vrp->vrp_vcpu_id;

	for (;;) {
		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't lock vcpu run mtx (%d)",
			    __func__, (int)ret);
			return ((void *)ret);
		}

		/* If we are halted or paused, wait */
		if (vcpu_hlt[n]) {
			while (current_vm->vm_paused == 1) {
				ret = pthread_cond_wait(&vcpu_run_cond[n],
				    &vcpu_run_mtx[n]);
				if (ret) {
					log_warnx(
					    "%s: can't wait on cond (%d)",
					    __func__, (int)ret);
					(void)pthread_mutex_unlock(
					    &vcpu_run_mtx[n]);
					break;
				}
			}
			if (vcpu_hlt[n]) {
				ret = pthread_cond_wait(&vcpu_run_cond[n],
				    &vcpu_run_mtx[n]);

				if (ret) {
					log_warnx(
					    "%s: can't wait on cond (%d)",
					    __func__, (int)ret);
					(void)pthread_mutex_unlock(
					    &vcpu_run_mtx[n]);
					break;
				}
			}
		}

		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't unlock mutex on cond (%d)",
			    __func__, (int)ret);
			break;
		}

		if (vrp->vrp_irqready && i8259_is_pending()) {
			irq = i8259_ack();
			vrp->vrp_irq = irq;
		} else
			vrp->vrp_irq = 0xFFFF;

		/* Still more pending? */
		if (i8259_is_pending()) {
			/*
			 * XXX can probably avoid ioctls here by providing
			 * intr in vrp
			 */
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 1)) {
				fatal("can't set INTR");
			}
		} else {
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 0)) {
				fatal("can't clear INTR");
			}
		}

		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) < 0) {
			/* If run ioctl failed, exit */
			ret = errno;
			log_warn("%s: vm %d / vcpu %d run ioctl failed",
			    __func__, vrp->vrp_vm_id, n);
			break;
		}

		/* If the VM is terminating, exit normally */
		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
			ret = (intptr_t)NULL;
			break;
		}

		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
			/*
			 * vmm(4) needs help handling an exit, handle in
			 * vcpu_exit.
			 */
			ret = vcpu_exit(vrp);
			if (ret)
				break;
		}
	}

	mutex_lock(&threadmutex);
	vcpu_done[n] = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return ((void *)ret);
}

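/*
 * vcpu_pic_intr
 *
 * Asks vmm(4) to assert or deassert the INTR pin of the supplied VCPU.
 *
 * Parameters:
 *  vm_id: VM ID of the VM whose VCPU is changed
 *  vcpu_id: VCPU ID whose INTR pin is changed
 *  intr: 1 to assert the INTR pin, 0 to deassert it
 *
 * Return values:
 *  0: success
 *  !0: ioctl to vmm(4) failed
 */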
int
vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
{
	struct vm_intr_params vip;

	memset(&vip, 0, sizeof(vip));

	vip.vip_vm_id = vm_id;
	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
	vip.vip_intr = intr;

	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) < 0)
		return (errno);

	return (0);
}

/*
 * vcpu_exit_pci
 *
 * Handle all I/O to the emulated PCI subsystem.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return value:
 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
 *      be injected.
 */
uint8_t
vcpu_exit_pci(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr;

	intr = 0xFF;

	switch (vei->vei.vei_port) {
	case PCI_MODE1_ADDRESS_REG:
		pci_handle_address_reg(vrp);
		break;
	case PCI_MODE1_DATA_REG:
	case PCI_MODE1_DATA_REG + 1:
	case PCI_MODE1_DATA_REG + 2:
	case PCI_MODE1_DATA_REG + 3:
		pci_handle_data_reg(vrp);
		break;
	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
		intr = pci_handle_io(vrp);
		break;
	default:
		log_warnx("%s: unknown PCI register 0x%llx",
		    __progname, (uint64_t)vei->vei.vei_port);
		break;
	}

	return (intr);
}

/*
 * vcpu_exit_inout
 *
 * Handle all I/O exits that need to be emulated in vmd. This includes the
 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 */
void
vcpu_exit_inout(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr = 0xFF;

	if (ioports_map[vei->vei.vei_port] != NULL)
		intr = ioports_map[vei->vei.vei_port](vrp);
	else if (vei->vei.vei_dir == VEI_DIR_IN)
		set_return_data(vei, 0xFFFFFFFF);

	if (intr != 0xFF)
		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
}

/*
 * vcpu_exit
 *
 * Handle a vcpu exit. This function is called when it is determined that
 * vmm(4) requires the assistance of vmd to support a particular guest
 * exit type (eg, accessing an I/O port or device). Guest state is contained
 * in 'vrp', and will be resent to vmm(4) on exit completion.
 *
 * Upon conclusion of handling the exit, the function determines if any
 * interrupts should be injected into the guest, and asserts the proper
 * IRQ line whose interrupt should be vectored.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: the exit was handled successfully
 *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
 */
int
vcpu_exit(struct vm_run_params *vrp)
{
	int ret;

	switch (vrp->vrp_exit_reason) {
	case VMX_EXIT_INT_WINDOW:
	case SVM_VMEXIT_VINTR:
	case VMX_EXIT_CPUID:
	case VMX_EXIT_EXTINT:
	case SVM_VMEXIT_INTR:
	case VMX_EXIT_EPT_VIOLATION:
	case SVM_VMEXIT_NPF:
	case SVM_VMEXIT_MSR:
	case SVM_VMEXIT_CPUID:
		/*
		 * We may be exiting to vmd to handle a pending interrupt but
		 * at the same time the last exit type may have been one of
		 * these. In this case, there's nothing extra to be done
		 * here (and falling through to the default case below results
		 * in more vmd log spam).
		 */
		break;
	case VMX_EXIT_IO:
	case SVM_VMEXIT_IOIO:
		vcpu_exit_inout(vrp);
		break;
	case VMX_EXIT_HLT:
	case SVM_VMEXIT_HLT:
		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
		if (ret) {
			log_warnx("%s: can't lock vcpu mutex (%d)",
			    __func__, ret);
			return (ret);
		}
		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
		if (ret) {
			log_warnx("%s: can't unlock vcpu mutex (%d)",
			    __func__, ret);
			return (ret);
		}
		break;
	case VMX_EXIT_TRIPLE_FAULT:
	case SVM_VMEXIT_SHUTDOWN:
		/* reset VM */
		return (EAGAIN);
	default:
		log_debug("%s: unknown exit reason 0x%x",
		    __progname, vrp->vrp_exit_reason);
	}

	/* Process any pending traffic */
	vionet_process_rx(vrp->vrp_vm_id);

	vrp->vrp_continue = 1;

	return (0);
}

/*
 * find_gpa_range
 *
 * Search for a contiguous guest physical mem range.
 *
 * Parameters:
 *  vcp: VM create parameters that contain the memory map to search in
 *  gpa: the starting guest physical address
 *  len: the length of the memory range
 *
 * Return values:
 *  NULL: on failure if there is no memory range as described by the parameters
 *  Pointer to vm_mem_range that contains the start of the range otherwise.
 */
static struct vm_mem_range *
find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
{
	size_t i, n;
	struct vm_mem_range *vmr;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
			break;
	}

	/* No range found. */
	if (i == vcp->vcp_nmemranges)
		return (NULL);

	/*
	 * vmr may cover the range [gpa, gpa + len) only partly. Make
	 * sure that the following vm_mem_ranges are contiguous and
	 * cover the rest.
	 */
	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
	if (len < n)
		len = 0;
	else
		len -= n;
	gpa = vmr->vmr_gpa + vmr->vmr_size;
	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa != vmr->vmr_gpa)
			return (NULL);
		if (len <= vmr->vmr_size)
			len = 0;
		else
			len -= vmr->vmr_size;

		gpa = vmr->vmr_gpa + vmr->vmr_size;
	}

	if (len != 0)
		return (NULL);

	return (vmr);
}

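/*
 * vaddr_mem
 *
 * Translates a guest physical address into the corresponding vmd virtual
 * address, if [gpa, gpa + len) is fully contained in a single guest
 * memory range.
 *
 * Parameters:
 *  gpa: guest physical address to translate
 *  len: number of bytes that must be addressable after gpa
 *
 * Return values:
 *  Pointer into the mapped guest memory on success, NULL otherwise.
 */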
void *
vaddr_mem(paddr_t gpa, size_t len)
{
	struct vm_create_params *vcp = &current_vm->vm_params.vmc_params;
	size_t i;
	struct vm_mem_range *vmr;
	paddr_t gpend = gpa + len;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa < vmr->vmr_gpa)
			continue;

		if (gpend >= vmr->vmr_gpa + vmr->vmr_size)
			continue;

		return ((char *)vmr->vmr_va + (gpa - vmr->vmr_gpa));
	}

	return (NULL);
}

/*
 * write_mem
 *
 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
 *
 * Parameters:
 *  dst: the destination paddr_t in the guest VM
 *  buf: data to copy (or NULL to zero the data)
 *  len: number of bytes to copy
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [dst, dst + len) does not
 *      exist in the guest.
 */
int
write_mem(paddr_t dst, const void *buf, size_t len)
{
	const char *from = buf;
	char *to;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
		    "len = 0x%zx", __func__, dst, len);
		return (EINVAL);
	}

	off = dst - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		to = (char *)vmr->vmr_va + off;
		if (buf == NULL)
			memset(to, 0, n);
		else {
			memcpy(to, from, n);
			from += n;
		}
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * read_mem
 *
 * Reads memory at guest paddr 'src' into 'buf'.
 *
 * Parameters:
 *  src: the source paddr_t in the guest VM to read from.
 *  buf: destination (local) buffer
 *  len: number of bytes to read
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [src, src + len) does not
 *      exist in the guest.
 */
int
read_mem(paddr_t src, void *buf, size_t len)
{
	char *from, *to = buf;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range src = 0x%lx, "
		    "len = 0x%zx", __func__, src, len);
		return (EINVAL);
	}

	off = src - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		from = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		to += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

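/*
 * iovec_mem
 *
 * Fills the supplied iovec array with pointers into the guest memory
 * backing [src, src + len), splitting the request across memory ranges
 * as needed.
 *
 * Return values:
 *  Number of iovec entries used on success.
 *  -1: failure, with errno set to EINVAL (bad range) or ENOMEM (iovcnt
 *      too small).
 */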
int
iovec_mem(paddr_t src, size_t len, struct iovec *iov, int iovcnt)
{
	size_t n, off;
	struct vm_mem_range *vmr;
	int niov = 0;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		return (-1);
	}

	off = src - vmr->vmr_gpa;
	while (len > 0) {
		if (niov == iovcnt) {
			errno = ENOMEM;
			return (-1);
		}

		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		iov[niov].iov_base = (char *)vmr->vmr_va + off;
		iov[niov].iov_len = n;

		niov++;

		len -= n;
		off = 0;
		vmr++;
	}

	return (niov);
}

/*
 * vcpu_assert_pic_irq
 *
 * Injects the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to inject to
 *  vcpu_id: VCPU ID to inject to
 *  irq: IRQ to inject
 */
void
vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	int ret;

	i8259_assert_irq(irq);

	if (i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
			fatalx("%s: can't assert INTR", __func__);

		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);

		vcpu_hlt[vcpu_id] = 0;
		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
		if (ret)
			fatalx("%s: can't signal (%d)", __func__, ret);
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
	}
}

/*
 * vcpu_deassert_pic_irq
 *
 * Clears the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to clear in
 *  vcpu_id: VCPU ID to clear in
 *  irq: IRQ to clear
 */
void
vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	i8259_deassert_irq(irq);

	if (!i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
			fatalx("%s: can't deassert INTR for vm_id %d, "
			    "vcpu_id %d", __func__, vm_id, vcpu_id);
	}
}

/*
 * fd_hasdata
 *
 * Determines if data can be read from a file descriptor.
 *
 * Parameters:
 *  fd: the fd to check
 *
 * Return values:
 *  1 if data can be read from an fd, or 0 otherwise.
 */
int
fd_hasdata(int fd)
{
	struct pollfd pfd[1];
	int nready, hasdata = 0;

	pfd[0].fd = fd;
	pfd[0].events = POLLIN;
	nready = poll(pfd, 1, 0);
	if (nready == -1)
		log_warn("checking file descriptor for data failed");
	else if (nready == 1 && pfd[0].revents & POLLIN)
		hasdata = 1;
	return (hasdata);
}

/*
 * mutex_lock
 *
 * Wrapper function for pthread_mutex_lock that does error checking and that
 * exits on failure
 */
void
mutex_lock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_lock(m);
	if (ret) {
		errno = ret;
		fatal("could not acquire mutex");
	}
}

/*
 * mutex_unlock
 *
 * Wrapper function for pthread_mutex_unlock that does error checking and that
 * exits on failure
 */
void
mutex_unlock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_unlock(m);
	if (ret) {
		errno = ret;
		fatal("could not release mutex");
	}
}

/*
 * set_return_data
 *
 * Utility function for manipulating register data in vm exit info structs.
 * This function ensures that the data is copied to the vei->vei.vei_data
 * field with the proper size for the operation being performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: return data
 */
void
set_return_data(struct vm_exit *vei, uint32_t data)
{
	switch (vei->vei.vei_size) {
	case 1:
		vei->vei.vei_data &= ~0xFF;
		vei->vei.vei_data |= (uint8_t)data;
		break;
	case 2:
		vei->vei.vei_data &= ~0xFFFF;
		vei->vei.vei_data |= (uint16_t)data;
		break;
	case 4:
		vei->vei.vei_data = data;
		break;
	}
}

/*
 * get_input_data
 *
 * Utility function for manipulating register data in vm exit info
 * structs. This function ensures that the data is copied from the
 * vei->vei.vei_data field with the proper size for the operation being
 * performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: location to store the result
 */
void
get_input_data(struct vm_exit *vei, uint32_t *data)
{
	switch (vei->vei.vei_size) {
	case 1:
		*data &= 0xFFFFFF00;
		*data |= (uint8_t)vei->vei.vei_data;
		break;
	case 2:
		*data &= 0xFFFF0000;
		*data |= (uint16_t)vei->vei.vei_data;
		break;
	case 4:
		*data = vei->vei.vei_data;
		break;
	default:
		log_warnx("%s: invalid i/o size %d", __func__,
		    vei->vei.vei_size);
	}
}