xref: /openbsd-src/usr.sbin/vmd/vm.c (revision f1dd7b858388b4a23f4f67a4957ec5ff656ebbe8)
1 /*	$OpenBSD: vm.c,v 1.62 2021/04/05 18:09:48 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/ioctl.h>
21 #include <sys/queue.h>
22 #include <sys/wait.h>
23 #include <sys/uio.h>
24 #include <sys/stat.h>
25 #include <sys/socket.h>
26 #include <sys/time.h>
27 #include <sys/mman.h>
28 
29 #include <dev/ic/i8253reg.h>
30 #include <dev/isa/isareg.h>
31 #include <dev/pci/pcireg.h>
32 
33 #include <machine/param.h>
34 #include <machine/psl.h>
35 #include <machine/pte.h>
36 #include <machine/specialreg.h>
37 #include <machine/vmmvar.h>
38 
39 #include <net/if.h>
40 
41 #include <errno.h>
42 #include <event.h>
43 #include <fcntl.h>
44 #include <imsg.h>
45 #include <limits.h>
46 #include <poll.h>
47 #include <pthread.h>
48 #include <stddef.h>
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <unistd.h>
53 #include <util.h>
54 
55 #include "vmd.h"
56 #include "vmm.h"
57 #include "loadfile.h"
58 #include "pci.h"
59 #include "virtio.h"
60 #include "proc.h"
61 #include "i8253.h"
62 #include "i8259.h"
63 #include "ns8250.h"
64 #include "mc146818.h"
65 #include "fw_cfg.h"
66 #include "atomicio.h"
67 
68 io_fn_t ioports_map[MAX_PORTS];
69 
70 int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
71     struct vmop_create_params *, struct vcpu_reg_state *);
72 void vm_dispatch_vmm(int, short, void *);
73 void *event_thread(void *);
74 void *vcpu_run_loop(void *);
75 int vcpu_exit(struct vm_run_params *);
76 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
77 void create_memory_map(struct vm_create_params *);
78 int alloc_guest_mem(struct vm_create_params *);
79 int vmm_create_vm(struct vm_create_params *);
80 void init_emulated_hw(struct vmop_create_params *, int,
81     int[][VM_MAX_BASE_PER_DISK], int *);
82 void restore_emulated_hw(struct vm_create_params *, int, int *,
83     int[][VM_MAX_BASE_PER_DISK], int);
84 void vcpu_exit_inout(struct vm_run_params *);
85 int vcpu_exit_eptviolation(struct vm_run_params *);
86 uint8_t vcpu_exit_pci(struct vm_run_params *);
87 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
88 int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
89 int send_vm(int, struct vm_create_params *);
90 int dump_send_header(int);
91 int dump_vmr(int, struct vm_mem_range *);
92 int dump_mem(int, struct vm_create_params *);
93 void restore_vmr(int, struct vm_mem_range *);
94 void restore_mem(int, struct vm_create_params *);
95 int restore_vm_params(int, struct vm_create_params *);
96 void pause_vm(struct vm_create_params *);
97 void unpause_vm(struct vm_create_params *);
98 
99 int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int);
100 
101 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
102     size_t);
103 
104 int con_fd;
105 struct vmd_vm *current_vm;
106 
107 extern struct vmd *env;
108 
109 extern char *__progname;
110 
111 pthread_mutex_t threadmutex;
112 pthread_cond_t threadcond;
113 
114 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
115 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
116 pthread_barrier_t vm_pause_barrier;
117 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
118 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
119 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
120 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
121 
122 /*
123  * Represents a standard register set for an OS to be booted
124  * as a flat 64 bit address space.
125  *
126  * NOT set here are:
127  *  RIP
128  *  RSP
129  *  GDTR BASE
130  *
131  * Specific bootloaders should clone this structure and override
132  * those fields as needed.
133  *
134  * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
135  *        features of the CPU in use.
136  */
137 static const struct vcpu_reg_state vcpu_init_flat64 = {
138 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
139 	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
140 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
141 	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
142 	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
143 	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
144 	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
145 	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
146 	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
147 	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
148 	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
149 	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
150 	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
151 	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
152 	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
153 	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
154 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
155 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
156 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
157 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
158 	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
159 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
160 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
161 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
162 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
163 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
164 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
165 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
166 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
167 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
168 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
169 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
170 	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
171 	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
172 };
173 
174 /*
175  * Represents a standard register set for a BIOS to be booted
176  * as a flat 16 bit address space.
177  */
178 static const struct vcpu_reg_state vcpu_init_flat16 = {
179 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
180 	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
181 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
182 	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
183 	.vrs_crs[VCPU_REGS_CR3] = 0,
184 	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
185 	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
186 	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
187 	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
188 	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
189 	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
190 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
191 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
192 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
193 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
194 	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
195 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
196 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
197 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
198 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
199 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
200 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
201 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
202 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
203 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
204 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
205 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
206 	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
207 };
208 
209 /*
210  * loadfile_bios
211  *
212  * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
213  * directly into memory.
214  *
215  * Parameters:
216  *  fp: gzFile handle of the BIOS image to load
217  *  size: uncompressed size of the image
218  *  (out) vrs: register state to set on init for this kernel
219  *
220  * Return values:
221  *  0 if successful
222  *  -1 if the image could not be read into memory (errno is set accordingly)
223  */
224 int
225 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
226 {
227 	off_t	 off;
228 
229 	/* Set up a "flat 16 bit" register state for BIOS */
230 	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
231 
232 	/* Seek to the beginning of the BIOS image */
233 	if (gzseek(fp, 0, SEEK_SET) == -1)
234 		return (-1);
235 
236 	/* The BIOS image must end at 1M */
237 	if ((off = 1048576 - size) < 0)
238 		return (-1);
239 
240 	/* Read BIOS image into memory */
241 	if (mread(fp, off, size) != (size_t)size) {
242 		errno = EIO;
243 		return (-1);
244 	}
245 
246 	log_debug("%s: loaded BIOS image", __func__);
247 
248 	return (0);
249 }
250 
251 /*
252  * start_vm
253  *
254  * After forking a new VM process, starts the new VM with the creation
255  * parameters supplied (in the incoming vm->vm_params field). This
256  * function performs a basic sanity check on the incoming parameters
257  * and then performs the following steps to complete the creation of the VM:
258  *
259  * 1. validates and creates the new VM
260  * 2. opens the imsg control channel to the parent and drops more privilege
261  * 3. drops additional privileges by calling pledge(2)
262  * 4. loads the kernel from the disk image or file descriptor
263  * 5. runs the VM's VCPU loops.
264  *
265  * Parameters:
266  *  vm: The VM data structure, including the VM create parameters.
267  *  fd: The imsg socket that is connected to the parent process.
268  *
269  * Return values:
270  *  0: success
271  *  !0 : failure - typically an errno indicating the source of the failure
272  */
273 int
274 start_vm(struct vmd_vm *vm, int fd)
275 {
276 	struct vmop_create_params *vmc = &vm->vm_params;
277 	struct vm_create_params	*vcp = &vmc->vmc_params;
278 	struct vcpu_reg_state	 vrs;
279 	int			 nicfds[VMM_MAX_NICS_PER_VM];
280 	int			 ret;
281 	gzFile			 fp;
282 	size_t			 i;
283 	struct vm_rwregs_params  vrp;
284 	struct stat		 sb;
285 
286 	/* Child */
287 	setproctitle("%s", vcp->vcp_name);
288 	log_procinit(vcp->vcp_name);
289 
290 	if (!(vm->vm_state & VM_STATE_RECEIVED))
291 		create_memory_map(vcp);
292 
293 	ret = alloc_guest_mem(vcp);
294 
295 	if (ret) {
296 		errno = ret;
297 		fatal("could not allocate guest memory - exiting");
298 	}
299 
300 	ret = vmm_create_vm(vcp);
301 	current_vm = vm;
302 
303 	/* send back the kernel-generated vm id (0 on error) */
304 	if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
305 	    sizeof(vcp->vcp_id))
306 		fatal("write vcp id");
307 
308 	if (ret) {
309 		errno = ret;
310 		fatal("create vmm ioctl failed - exiting");
311 	}
312 
313 	/*
314 	 * pledge in the vm processes:
315 	 * stdio - for malloc and basic I/O including events.
316 	 * recvfd - for send/recv.
317 	 * vmm - for the vmm ioctls and operations.
318 	 */
319 	if (pledge("stdio vmm recvfd", NULL) == -1)
320 		fatal("pledge");
321 
322 	if (vm->vm_state & VM_STATE_RECEIVED) {
323 		ret = read(vm->vm_receive_fd, &vrp, sizeof(vrp));
324 		if (ret != sizeof(vrp)) {
325 			fatal("received incomplete vrp - exiting");
326 		}
327 		vrs = vrp.vrwp_regs;
328 	} else {
329 		/*
330 		 * Set up default "flat 64 bit" register state - RIP,
331 		 * RSP, and GDT info will be set in bootloader
332 		 */
333 		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
334 
335 		/* Find and open kernel image */
336 		if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
337 			fatalx("failed to open kernel - exiting");
338 
339 		/* Load kernel image */
340 		ret = loadfile_elf(fp, vcp, &vrs);
341 
342 		/*
343 		 * Try BIOS as a fallback (only if it was provided as an image
344 		 * with vm->vm_kernel and the file is not compressed)
345 		 */
346 		if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
347 		    gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
348 			ret = loadfile_bios(fp, sb.st_size, &vrs);
349 
350 		if (ret)
351 			fatal("failed to load kernel or BIOS - exiting");
352 
353 		gzclose(fp);
354 	}
355 
356 	if (vm->vm_kernel != -1)
357 		close(vm->vm_kernel);
358 
359 	con_fd = vm->vm_tty;
360 	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
361 		fatal("failed to set nonblocking mode on console");
362 
363 	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
364 		nicfds[i] = vm->vm_ifs[i].vif_fd;
365 
366 	event_init();
367 
368 	if (vm->vm_state & VM_STATE_RECEIVED) {
369 		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
370 		    vm->vm_disks, vm->vm_cdrom);
371 		restore_mem(vm->vm_receive_fd, vcp);
372 		if (restore_vm_params(vm->vm_receive_fd, vcp))
373 			fatal("restore vm params failed");
374 		unpause_vm(vcp);
375 	}
376 
377 	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
378 		fatal("setup vm pipe");
379 
380 	/* Execute the vcpu run loop(s) for this VM */
381 	ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);
382 
383 	/* Ensure that any in-flight data is written back */
384 	virtio_shutdown(vm);
385 
386 	return (ret);
387 }
388 
389 /*
390  * vm_dispatch_vmm
391  *
392  * imsg callback for messages that are received from the vmm parent process.
393  */
394 void
395 vm_dispatch_vmm(int fd, short event, void *arg)
396 {
397 	struct vmd_vm		*vm = arg;
398 	struct vmop_result	 vmr;
399 	struct vmop_addr_result	 var;
400 	struct imsgev		*iev = &vm->vm_iev;
401 	struct imsgbuf		*ibuf = &iev->ibuf;
402 	struct imsg		 imsg;
403 	ssize_t			 n;
404 	int			 verbose;
405 
406 	if (event & EV_READ) {
407 		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
408 			fatal("%s: imsg_read", __func__);
409 		if (n == 0)
410 			_exit(0);
411 	}
412 
413 	if (event & EV_WRITE) {
414 		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
415 			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
416 		if (n == 0)
417 			_exit(0);
418 	}
419 
420 	for (;;) {
421 		if ((n = imsg_get(ibuf, &imsg)) == -1)
422 			fatal("%s: imsg_get", __func__);
423 		if (n == 0)
424 			break;
425 
426 #if DEBUG > 1
427 		log_debug("%s: got imsg %d from %s",
428 		    __func__, imsg.hdr.type,
429 		    vm->vm_params.vmc_params.vcp_name);
430 #endif
431 
432 		switch (imsg.hdr.type) {
433 		case IMSG_CTL_VERBOSE:
434 			IMSG_SIZE_CHECK(&imsg, &verbose);
435 			memcpy(&verbose, imsg.data, sizeof(verbose));
436 			log_setverbose(verbose);
437 			break;
438 		case IMSG_VMDOP_VM_SHUTDOWN:
439 			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
440 				_exit(0);
441 			break;
442 		case IMSG_VMDOP_VM_REBOOT:
443 			if (vmmci_ctl(VMMCI_REBOOT) == -1)
444 				_exit(0);
445 			break;
446 		case IMSG_VMDOP_PAUSE_VM:
447 			vmr.vmr_result = 0;
448 			vmr.vmr_id = vm->vm_vmid;
449 			pause_vm(&vm->vm_params.vmc_params);
450 			imsg_compose_event(&vm->vm_iev,
451 			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
452 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
453 			    sizeof(vmr));
454 			break;
455 		case IMSG_VMDOP_UNPAUSE_VM:
456 			vmr.vmr_result = 0;
457 			vmr.vmr_id = vm->vm_vmid;
458 			unpause_vm(&vm->vm_params.vmc_params);
459 			imsg_compose_event(&vm->vm_iev,
460 			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
461 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
462 			    sizeof(vmr));
463 			break;
464 		case IMSG_VMDOP_SEND_VM_REQUEST:
465 			vmr.vmr_id = vm->vm_vmid;
466 			vmr.vmr_result = send_vm(imsg.fd,
467 			    &vm->vm_params.vmc_params);
468 			imsg_compose_event(&vm->vm_iev,
469 			    IMSG_VMDOP_SEND_VM_RESPONSE,
470 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
471 			    sizeof(vmr));
472 			if (!vmr.vmr_result) {
473 				imsg_flush(&current_vm->vm_iev.ibuf);
474 				_exit(0);
475 			}
476 			break;
477 		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
478 			IMSG_SIZE_CHECK(&imsg, &var);
479 			memcpy(&var, imsg.data, sizeof(var));
480 
481 			log_debug("%s: received tap addr %s for nic %d",
482 			    vm->vm_params.vmc_params.vcp_name,
483 			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);
484 
485 			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
486 			break;
487 		default:
488 			fatalx("%s: got invalid imsg %d from %s",
489 			    __func__, imsg.hdr.type,
490 			    vm->vm_params.vmc_params.vcp_name);
491 		}
492 		imsg_free(&imsg);
493 	}
494 	imsg_event_add(iev);
495 }
496 
497 /*
498  * vm_shutdown
499  *
500  * Tell the vmm parent process to shut down or reboot the VM, then exit.
501  */
502 __dead void
503 vm_shutdown(unsigned int cmd)
504 {
505 	switch (cmd) {
506 	case VMMCI_NONE:
507 	case VMMCI_SHUTDOWN:
508 		(void)imsg_compose_event(&current_vm->vm_iev,
509 		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
510 		break;
511 	case VMMCI_REBOOT:
512 		(void)imsg_compose_event(&current_vm->vm_iev,
513 		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
514 		break;
515 	default:
516 		fatalx("invalid vm ctl command: %d", cmd);
517 	}
518 	imsg_flush(&current_vm->vm_iev.ibuf);
519 
520 	_exit(0);
521 }
522 
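/*
 * send_vm
 *
 * Dumps the state of the running VM to fd so it can be received elsewhere:
 * sends the dump header, pauses the VM, then writes the VM create
 * parameters, per-VCPU register state, emulated device state, guest
 * memory and per-VCPU vm params, and finally terminates the local VM.
 *
 * Parameters:
 *  fd: file descriptor to write the VM state to
 *  vcp: VM create parameters of the VM being sent
 *
 * Return values:
 *  0: success
 *  !0 : failure (the VM is unpaused again in this case)
 */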
523 int
524 send_vm(int fd, struct vm_create_params *vcp)
525 {
526 	struct vm_rwregs_params	   vrp;
527 	struct vm_rwvmparams_params vpp;
528 	struct vmop_create_params *vmc;
529 	struct vm_terminate_params vtp;
530 	unsigned int		   flags = 0;
531 	unsigned int		   i;
532 	int			   ret = 0;
533 	size_t			   sz;
534 
535 	if (dump_send_header(fd)) {
536 		log_info("%s: failed to send vm dump header", __func__);
537 		goto err;
538 	}
539 
540 	pause_vm(vcp);
541 
542 	vmc = calloc(1, sizeof(struct vmop_create_params));
543 	if (vmc == NULL) {
544 		log_warn("%s: calloc error getting vmc", __func__);
545 		ret = -1;
546 		goto err;
547 	}
548 
549 	flags |= VMOP_CREATE_MEMORY;
550 	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
551 	    vmop_create_params));
552 	vmc->vmc_flags = flags;
553 	vrp.vrwp_vm_id = vcp->vcp_id;
554 	vrp.vrwp_mask = VM_RWREGS_ALL;
555 	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
556 	vpp.vpp_vm_id = vcp->vcp_id;
557 
558 	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
559 	if (sz != sizeof(struct vmop_create_params)) {
560 		ret = -1;
561 		goto err;
562 	}
563 
564 	for (i = 0; i < vcp->vcp_ncpus; i++) {
565 		vrp.vrwp_vcpu_id = i;
566 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
567 			log_warn("%s: readregs failed", __func__);
568 			goto err;
569 		}
570 
571 		sz = atomicio(vwrite, fd, &vrp,
572 		    sizeof(struct vm_rwregs_params));
573 		if (sz != sizeof(struct vm_rwregs_params)) {
574 			log_warn("%s: dumping registers failed", __func__);
575 			ret = -1;
576 			goto err;
577 		}
578 	}
579 
580 	if ((ret = i8253_dump(fd)))
581 		goto err;
582 	if ((ret = i8259_dump(fd)))
583 		goto err;
584 	if ((ret = ns8250_dump(fd)))
585 		goto err;
586 	if ((ret = mc146818_dump(fd)))
587 		goto err;
588 	if ((ret = fw_cfg_dump(fd)))
589 		goto err;
590 	if ((ret = pci_dump(fd)))
591 		goto err;
592 	if ((ret = virtio_dump(fd)))
593 		goto err;
594 	if ((ret = dump_mem(fd, vcp)))
595 		goto err;
596 
597 	for (i = 0; i < vcp->vcp_ncpus; i++) {
598 		vpp.vpp_vcpu_id = i;
599 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
600 			log_warn("%s: readvmparams failed", __func__);
601 			goto err;
602 		}
603 
604 		sz = atomicio(vwrite, fd, &vpp,
605 		    sizeof(struct vm_rwvmparams_params));
606 		if (sz != sizeof(struct vm_rwvmparams_params)) {
607 			log_warn("%s: dumping vm params failed", __func__);
608 			ret = -1;
609 			goto err;
610 		}
611 	}
612 
613 	vtp.vtp_vm_id = vcp->vcp_id;
614 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
615 		log_warnx("%s: term IOC error: %d, %d", __func__,
616 		    errno, ENOENT);
617 	}
618 err:
619 	close(fd);
620 	if (ret)
621 		unpause_vm(vcp);
622 	return ret;
623 }
624 
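/*
 * dump_send_header
 *
 * Writes the vm_dump_header to fd, containing the dump signature, the
 * dump version and the values of a fixed set of host CPUID leaves.
 *
 * Return values:
 *  0: success
 *  -1: the header could not be written completely
 */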
625 int
626 dump_send_header(int fd) {
627 	struct vm_dump_header	   vmh;
628 	int			   i;
629 
630 	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
631 	    sizeof(vmh.vmh_signature));
632 
633 	vmh.vmh_cpuids[0].code = 0x00;
634 	vmh.vmh_cpuids[0].leaf = 0x00;
635 
636 	vmh.vmh_cpuids[1].code = 0x01;
637 	vmh.vmh_cpuids[1].leaf = 0x00;
638 
639 	vmh.vmh_cpuids[2].code = 0x07;
640 	vmh.vmh_cpuids[2].leaf = 0x00;
641 
642 	vmh.vmh_cpuids[3].code = 0x0d;
643 	vmh.vmh_cpuids[3].leaf = 0x00;
644 
645 	vmh.vmh_cpuids[4].code = 0x80000001;
646 	vmh.vmh_cpuids[4].leaf = 0x00;
647 
648 	vmh.vmh_version = VM_DUMP_VERSION;
649 
650 	for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
651 		CPUID_LEAF(vmh.vmh_cpuids[i].code,
652 		    vmh.vmh_cpuids[i].leaf,
653 		    vmh.vmh_cpuids[i].a,
654 		    vmh.vmh_cpuids[i].b,
655 		    vmh.vmh_cpuids[i].c,
656 		    vmh.vmh_cpuids[i].d);
657 	}
658 
659 	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
660 		return (-1);
661 
662 	return (0);
663 }
664 
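/*
 * dump_mem
 *
 * Dumps all guest memory ranges described in vcp to fd, one range at a
 * time via dump_vmr.
 *
 * Return values:
 *  0: success
 *  !0 : failure from dump_vmr
 */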
665 int
666 dump_mem(int fd, struct vm_create_params *vcp)
667 {
668 	unsigned int	i;
669 	int		ret;
670 	struct		vm_mem_range *vmr;
671 
672 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
673 		vmr = &vcp->vcp_memranges[i];
674 		ret = dump_vmr(fd, vmr);
675 		if (ret)
676 			return ret;
677 	}
678 	return (0);
679 }
680 
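/*
 * restore_vm_params
 *
 * Reads one vm_rwvmparams_params structure per VCPU from fd and writes
 * it back to vmm(4) via the VMM_IOC_WRITEVMPARAMS ioctl.
 *
 * Return values:
 *  0: success
 *  -1: read or ioctl failure
 */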
681 int
682 restore_vm_params(int fd, struct vm_create_params *vcp) {
683 	unsigned int			i;
684 	struct vm_rwvmparams_params    vpp;
685 
686 	for (i = 0; i < vcp->vcp_ncpus; i++) {
687 		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
688 			log_warn("%s: error restoring vm params", __func__);
689 			return (-1);
690 		}
691 		vpp.vpp_vm_id = vcp->vcp_id;
692 		vpp.vpp_vcpu_id = i;
693 		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
694 			log_debug("%s: writing vm params failed", __func__);
695 			return (-1);
696 		}
697 	}
698 	return (0);
699 }
700 
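/*
 * restore_mem
 *
 * Restores all guest memory ranges described in vcp from fd, one range
 * at a time via restore_vmr.
 */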
701 void
702 restore_mem(int fd, struct vm_create_params *vcp)
703 {
704 	unsigned int	     i;
705 	struct vm_mem_range *vmr;
706 
707 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
708 		vmr = &vcp->vcp_memranges[i];
709 		restore_vmr(fd, vmr);
710 	}
711 }
712 
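/*
 * dump_vmr
 *
 * Copies a single guest memory range from guest memory to fd, one page
 * at a time.
 *
 * Return values:
 *  0: success
 *  -1: the range could not be read or written completely
 */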
713 int
714 dump_vmr(int fd, struct vm_mem_range *vmr)
715 {
716 	size_t	rem = vmr->vmr_size, read=0;
717 	char	buf[PAGE_SIZE];
718 
719 	while (rem > 0) {
720 		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
721 			log_warn("failed to read vmr");
722 			return (-1);
723 		}
724 		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
725 			log_warn("failed to dump vmr");
726 			return (-1);
727 		}
728 		rem = rem - PAGE_SIZE;
729 		read = read + PAGE_SIZE;
730 	}
731 	return (0);
732 }
733 
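/*
 * restore_vmr
 *
 * Reads a single guest memory range from fd, one page at a time, and
 * writes it into guest memory. Fatal on read or write failure.
 */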
734 void
735 restore_vmr(int fd, struct vm_mem_range *vmr)
736 {
737 	size_t	rem = vmr->vmr_size, wrote=0;
738 	char	buf[PAGE_SIZE];
739 
740 	while (rem > 0) {
741 		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
742 			fatal("failed to restore vmr");
743 		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
744 			fatal("failed to write vmr");
745 		rem = rem - PAGE_SIZE;
746 		wrote = wrote + PAGE_SIZE;
747 	}
748 }
749 
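/*
 * pause_vm
 *
 * Pauses a running VM: marks it paused, wakes all VCPU threads and
 * waits on the pause barrier until every VCPU has parked, then stops
 * the emulated timer, RTC, UART and virtio devices.
 */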
750 void
751 pause_vm(struct vm_create_params *vcp)
752 {
753 	unsigned int n;
754 	int ret;
755 	if (current_vm->vm_state & VM_STATE_PAUSED)
756 		return;
757 
758 	current_vm->vm_state |= VM_STATE_PAUSED;
759 
760 	ret = pthread_barrier_init(&vm_pause_barrier, NULL, vcp->vcp_ncpus + 1);
761 	if (ret) {
762 		log_warnx("%s: cannot initialize pause barrier (%d)",
763 		    __progname, ret);
764 		return;
765 	}
766 
767 	for (n = 0; n < vcp->vcp_ncpus; n++) {
768 		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
769 		if (ret) {
770 			log_warnx("%s: can't broadcast vcpu run cond (%d)",
771 			    __func__, (int)ret);
772 			return;
773 		}
774 	}
775 	ret = pthread_barrier_wait(&vm_pause_barrier);
776 	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
777 		log_warnx("%s: could not wait on pause barrier (%d)",
778 		    __func__, (int)ret);
779 		return;
780 	}
781 
782 	ret = pthread_barrier_destroy(&vm_pause_barrier);
783 	if (ret) {
784 		log_warnx("%s: could not destroy pause barrier (%d)",
785 		    __progname, ret);
786 		return;
787 	}
788 
789 	i8253_stop();
790 	mc146818_stop();
791 	ns8250_stop();
792 	virtio_stop(vcp);
793 }
794 
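/*
 * unpause_vm
 *
 * Resumes a paused VM: clears the paused state, wakes all VCPU threads
 * waiting on their unpause condition variables, then restarts the
 * emulated timer, RTC, UART and virtio devices.
 */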
795 void
796 unpause_vm(struct vm_create_params *vcp)
797 {
798 	unsigned int n;
799 	int ret;
800 	if (!(current_vm->vm_state & VM_STATE_PAUSED))
801 		return;
802 
803 	current_vm->vm_state &= ~VM_STATE_PAUSED;
804 	for (n = 0; n < vcp->vcp_ncpus; n++) {
805 		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
806 		if (ret) {
807 			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
808 			    __func__, (int)ret);
809 			return;
810 		}
811 	}
812 
813 	i8253_start();
814 	mc146818_start();
815 	ns8250_start();
816 	virtio_start(vcp);
817 }
818 
819 /*
820  * vcpu_reset
821  *
822  * Requests vmm(4) to reset the VCPUs in the indicated VM to
823  * the register state provided
824  *
825  * Parameters:
826  *  vmid: VM ID to reset
827  *  vcpu_id: VCPU ID to reset
828  *  vrs: the register state to initialize
829  *
830  * Return values:
831  *  0: success
832  *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
833  *      valid)
834  */
835 int
836 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
837 {
838 	struct vm_resetcpu_params vrp;
839 
840 	memset(&vrp, 0, sizeof(vrp));
841 	vrp.vrp_vm_id = vmid;
842 	vrp.vrp_vcpu_id = vcpu_id;
843 	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
844 
845 	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
846 
847 	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
848 		return (errno);
849 
850 	return (0);
851 }
852 
853 /*
854  * create_memory_map
855  *
856  * Sets up the guest physical memory ranges that the VM can access.
857  *
858  * Parameters:
859  *  vcp: VM create parameters describing the VM whose memory map
860  *       is being created
861  *
862  * Return values:
863  *  nothing
864  */
865 void
866 create_memory_map(struct vm_create_params *vcp)
867 {
868 	size_t len, mem_bytes, mem_mb;
869 
870 	mem_mb = vcp->vcp_memranges[0].vmr_size;
871 	vcp->vcp_nmemranges = 0;
872 	if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
873 		return;
874 
875 	mem_bytes = mem_mb * 1024 * 1024;
876 
877 	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
878 	len = LOWMEM_KB * 1024;
879 	vcp->vcp_memranges[0].vmr_gpa = 0x0;
880 	vcp->vcp_memranges[0].vmr_size = len;
881 	mem_bytes -= len;
882 
883 	/*
884 	 * Second memory region: LOWMEM_KB - 1MB.
885 	 *
886 	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
887 	 * We have to add this region, because some systems
888 	 * unconditionally write to 0xb8000 (VGA RAM), and
889 	 * we need to make sure that vmm(4) permits accesses
890 	 * to it. So allocate guest memory for it.
891 	 */
892 	len = 0x100000 - LOWMEM_KB * 1024;
893 	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
894 	vcp->vcp_memranges[1].vmr_size = len;
895 	mem_bytes -= len;
896 
897 	/* Make sure that we do not place physical memory into MMIO ranges. */
898 	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
899 		len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
900 	else
901 		len = mem_bytes;
902 
903 	/* Third memory region: 1MB - (1MB + len) */
904 	vcp->vcp_memranges[2].vmr_gpa = 0x100000;
905 	vcp->vcp_memranges[2].vmr_size = len;
906 	mem_bytes -= len;
907 
908 	if (mem_bytes > 0) {
909 		/* Fourth memory region for the remaining memory (if any) */
910 		vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
911 		vcp->vcp_memranges[3].vmr_size = mem_bytes;
912 		vcp->vcp_nmemranges = 4;
913 	} else
914 		vcp->vcp_nmemranges = 3;
915 }
916 
917 /*
918  * alloc_guest_mem
919  *
920  * Allocates memory for the guest.
921  * Instead of doing a single allocation with one mmap(), we allocate memory
922  * separately for every range for the following reasons:
923  * - ASLR for the individual ranges
924  * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
925  *   map the single mmap'd userspace memory to the individual guest physical
926  *   memory ranges, the underlying amap of the single mmap'd range would have
927  *   to allocate per-page reference counters. The reason is that the
928  *   individual guest physical ranges would reference the single mmap'd region
929  *   only partially. However, if every guest physical range has its own
930  *   corresponding mmap'd userspace allocation, there are no partial
931  *   references: every guest physical range fully references an mmap'd
932  *   range => no per-page reference counters have to be allocated.
933  *
934  * Return values:
935  *  0: success
936  *  !0: failure - errno indicating the source of the failure
937  */
938 int
939 alloc_guest_mem(struct vm_create_params *vcp)
940 {
941 	void *p;
942 	int ret;
943 	size_t i, j;
944 	struct vm_mem_range *vmr;
945 
946 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
947 		vmr = &vcp->vcp_memranges[i];
948 		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
949 		    MAP_PRIVATE | MAP_ANON, -1, 0);
950 		if (p == MAP_FAILED) {
951 			ret = errno;
952 			for (j = 0; j < i; j++) {
953 				vmr = &vcp->vcp_memranges[j];
954 				munmap((void *)vmr->vmr_va, vmr->vmr_size);
955 			}
956 
957 			return (ret);
958 		}
959 
960 		vmr->vmr_va = (vaddr_t)p;
961 	}
962 
963 	return (0);
964 }
965 
966 /*
967  * vmm_create_vm
968  *
969  * Requests vmm(4) to create a new VM using the supplied creation
970  * parameters. This operation results in the creation of the in-kernel
971  * structures for the VM, but does not start the VM's vcpu(s).
972  *
973  * Parameters:
974  *  vcp: vm_create_params struct containing the VM's desired creation
975  *      configuration
976  *
977  * Return values:
978  *  0: success
979  *  !0 : ioctl to vmm(4) failed
980  */
981 int
982 vmm_create_vm(struct vm_create_params *vcp)
983 {
984 	/* Sanity check arguments */
985 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
986 		return (EINVAL);
987 
988 	if (vcp->vcp_nmemranges == 0 ||
989 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
990 		return (EINVAL);
991 
992 	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
993 		return (EINVAL);
994 
995 	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
996 		return (EINVAL);
997 
998 	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
999 		return (errno);
1000 
1001 	return (0);
1002 }
1003 
1004 /*
1005  * init_emulated_hw
1006  *
1007  * Initializes the userspace hardware emulation
1008  */
1009 void
1010 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
1011     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1012 {
1013 	struct vm_create_params *vcp = &vmc->vmc_params;
1014 	int i;
1015 	uint64_t memlo, memhi;
1016 
1017 	/* Calculate memory size for NVRAM registers */
1018 	memlo = memhi = 0;
1019 	if (vcp->vcp_nmemranges > 2)
1020 		memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000;
1021 
1022 	if (vcp->vcp_nmemranges > 3)
1023 		memhi = vcp->vcp_memranges[3].vmr_size;
1024 
1025 	/* Reset the IO port map */
1026 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1027 
1028 	/* Init i8253 PIT */
1029 	i8253_init(vcp->vcp_id);
1030 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1031 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1032 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1033 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1034 	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
1035 
1036 	/* Init mc146818 RTC */
1037 	mc146818_init(vcp->vcp_id, memlo, memhi);
1038 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1039 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1040 
1041 	/* Init master and slave PICs */
1042 	i8259_init();
1043 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1044 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1045 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1046 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1047 	ioports_map[ELCR0] = vcpu_exit_elcr;
1048 	ioports_map[ELCR1] = vcpu_exit_elcr;
1049 
1050 	/* Init ns8250 UART */
1051 	ns8250_init(con_fd, vcp->vcp_id);
1052 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1053 		ioports_map[i] = vcpu_exit_com;
1054 
1055 	/* Init QEMU fw_cfg interface */
1056 	fw_cfg_init(vmc);
1057 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1058 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1059 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1060 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1061 
1062 	/* Initialize PCI */
1063 	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
1064 		ioports_map[i] = vcpu_exit_pci;
1065 
1066 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1067 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1068 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1069 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1070 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1071 	pci_init();
1072 
1073 	/* Initialize virtio devices */
1074 	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
1075 }
1076 /*
1077  * restore_emulated_hw
1078  *
1079  * Restores the userspace hardware emulation from fd
1080  */
1081 void
1082 restore_emulated_hw(struct vm_create_params *vcp, int fd,
1083     int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
1084 {
1085 	/* struct vm_create_params *vcp = &vmc->vmc_params; */
1086 	int i;
1087 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1088 
1089 	/* Init i8253 PIT */
1090 	i8253_restore(fd, vcp->vcp_id);
1091 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1092 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1093 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1094 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1095 
1096 	/* Init master and slave PICs */
1097 	i8259_restore(fd);
1098 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1099 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1100 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1101 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1102 
1103 	/* Init ns8250 UART */
1104 	ns8250_restore(fd, con_fd, vcp->vcp_id);
1105 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1106 		ioports_map[i] = vcpu_exit_com;
1107 
1108 	/* Init mc146818 RTC */
1109 	mc146818_restore(fd, vcp->vcp_id);
1110 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1111 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1112 
1113 	/* Init QEMU fw_cfg interface */
1114 	fw_cfg_restore(fd);
1115 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1116 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1117 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1118 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1119 
1120 	/* Initialize PCI */
1121 	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
1122 		ioports_map[i] = vcpu_exit_pci;
1123 
1124 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1125 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1126 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1127 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1128 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1129 	pci_restore(fd);
1130 	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
1131 }
1132 
1133 /*
1134  * run_vm
1135  *
1136  * Runs the VM whose creation parameters are specified in vcp
1137  *
1138  * Parameters:
1139  *  child_cdrom: previously-opened child ISO disk file descriptor
1140  *  child_disks: previously-opened child VM disk file descriptors
1141  *  child_taps: previously-opened child tap file descriptors
1142  *  vmc: vmop_create_params struct containing the VM's desired creation
1143  *      configuration
1144  *  vrs: VCPU register state to initialize
1145  *
1146  * Return values:
1147  *  0: the VM exited normally
1148  *  !0 : the VM exited abnormally or failed to start
1149  */
1150 int
1151 run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
1152     int *child_taps, struct vmop_create_params *vmc,
1153     struct vcpu_reg_state *vrs)
1154 {
1155 	struct vm_create_params *vcp = &vmc->vmc_params;
1156 	struct vm_rwregs_params vregsp;
1157 	uint8_t evdone = 0;
1158 	size_t i;
1159 	int ret;
1160 	pthread_t *tid, evtid;
1161 	struct vm_run_params **vrp;
1162 	void *exit_status;
1163 
1164 	if (vcp == NULL)
1165 		return (EINVAL);
1166 
1167 	if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
1168 		return (EINVAL);
1169 
1170 	if (child_disks == NULL && vcp->vcp_ndisks != 0)
1171 		return (EINVAL);
1172 
1173 	if (child_taps == NULL && vcp->vcp_nnics != 0)
1174 		return (EINVAL);
1175 
1176 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
1177 		return (EINVAL);
1178 
1179 	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
1180 		return (EINVAL);
1181 
1182 	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
1183 		return (EINVAL);
1184 
1185 	if (vcp->vcp_nmemranges == 0 ||
1186 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1187 		return (EINVAL);
1188 
1189 	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
1190 	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
1191 	if (tid == NULL || vrp == NULL) {
1192 		log_warn("%s: memory allocation error - exiting.",
1193 		    __progname);
1194 		return (ENOMEM);
1195 	}
1196 
1197 	log_debug("%s: initializing hardware for vm %s", __func__,
1198 	    vcp->vcp_name);
1199 
1200 	if (!(current_vm->vm_state & VM_STATE_RECEIVED))
1201 		init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);
1202 
1203 	ret = pthread_mutex_init(&threadmutex, NULL);
1204 	if (ret) {
1205 		log_warn("%s: could not initialize thread state mutex",
1206 		    __func__);
1207 		return (ret);
1208 	}
1209 	ret = pthread_cond_init(&threadcond, NULL);
1210 	if (ret) {
1211 		log_warn("%s: could not initialize thread state "
1212 		    "condition variable", __func__);
1213 		return (ret);
1214 	}
1215 
1216 	mutex_lock(&threadmutex);
1217 
1218 	log_debug("%s: starting vcpu threads for vm %s", __func__,
1219 	    vcp->vcp_name);
1220 
1221 	/*
1222 	 * Create and launch one thread for each VCPU. These threads may
1223 	 * migrate between PCPUs over time; the need to reload CPU state
1224 	 * in such situations is detected and performed by vmm(4) in the
1225 	 * kernel.
1226 	 */
1227 	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
1228 		vrp[i] = malloc(sizeof(struct vm_run_params));
1229 		if (vrp[i] == NULL) {
1230 			log_warn("%s: memory allocation error - "
1231 			    "exiting.", __progname);
1232 			/* caller will exit, so skip freeing */
1233 			return (ENOMEM);
1234 		}
1235 		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
1236 		if (vrp[i]->vrp_exit == NULL) {
1237 			log_warn("%s: memory allocation error - "
1238 			    "exiting.", __progname);
1239 			/* caller will exit, so skip freeing */
1240 			return (ENOMEM);
1241 		}
1242 		vrp[i]->vrp_vm_id = vcp->vcp_id;
1243 		vrp[i]->vrp_vcpu_id = i;
1244 
1245 		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
1246 			log_warnx("%s: cannot reset VCPU %zu - exiting.",
1247 			    __progname, i);
1248 			return (EIO);
1249 		}
1250 
1251 		/* write regs again: vcpu_reset() above re-initialized them */
1252 		if (current_vm->vm_state & VM_STATE_RECEIVED) {
1253 			vregsp.vrwp_vm_id = vcp->vcp_id;
1254 			vregsp.vrwp_vcpu_id = i;
1255 			vregsp.vrwp_regs = *vrs;
1256 			vregsp.vrwp_mask = VM_RWREGS_ALL;
1257 			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
1258 			    &vregsp)) == -1) {
1259 				log_warn("%s: writeregs failed", __func__);
1260 				return (ret);
1261 			}
1262 		}
1263 
1264 		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
1265 		if (ret) {
1266 			log_warnx("%s: cannot initialize cond var (%d)",
1267 			    __progname, ret);
1268 			return (ret);
1269 		}
1270 
1271 		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
1272 		if (ret) {
1273 			log_warnx("%s: cannot initialize mtx (%d)",
1274 			    __progname, ret);
1275 			return (ret);
1276 		}
1277 
1278 		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
1279 		if (ret) {
1280 			log_warnx("%s: cannot initialize unpause var (%d)",
1281 			    __progname, ret);
1282 			return (ret);
1283 		}
1284 
1285 		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
1286 		if (ret) {
1287 			log_warnx("%s: cannot initialize unpause mtx (%d)",
1288 			    __progname, ret);
1289 			return (ret);
1290 		}
1291 
1292 		vcpu_hlt[i] = 0;
1293 
1294 		/* Start each VCPU run thread at vcpu_run_loop */
1295 		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
1296 		if (ret) {
1297 			/* caller will _exit after this return */
1298 			ret = errno;
1299 			log_warn("%s: could not create vcpu thread %zu",
1300 			    __func__, i);
1301 			return (ret);
1302 		}
1303 	}
1304 
1305 	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
1306 	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
1307 	if (ret) {
1308 		errno = ret;
1309 		log_warn("%s: could not create event thread", __func__);
1310 		return (ret);
1311 	}
1312 
1313 	for (;;) {
1314 		ret = pthread_cond_wait(&threadcond, &threadmutex);
1315 		if (ret) {
1316 			log_warn("%s: waiting on thread state condition "
1317 			    "variable failed", __func__);
1318 			return (ret);
1319 		}
1320 
1321 		/*
1322 		 * Did a VCPU thread exit with an error? => return the first one
1323 		 */
1324 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1325 			if (vcpu_done[i] == 0)
1326 				continue;
1327 
1328 			if (pthread_join(tid[i], &exit_status)) {
1329 				log_warn("%s: failed to join thread %zd - "
1330 				    "exiting", __progname, i);
1331 				return (EIO);
1332 			}
1333 
1334 			ret = (intptr_t)exit_status;
1335 		}
1336 
1337 		/* Did the event thread exit? => return with an error */
1338 		if (evdone) {
1339 			if (pthread_join(evtid, &exit_status)) {
1340 				log_warn("%s: failed to join event thread - "
1341 				    "exiting", __progname);
1342 				return (EIO);
1343 			}
1344 
1345 			log_warnx("%s: vm %d event thread exited "
1346 			    "unexpectedly", __progname, vcp->vcp_id);
1347 			return (EIO);
1348 		}
1349 
1350 		/* Did all VCPU threads exit successfully? => return */
1351 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1352 			if (vcpu_done[i] == 0)
1353 				break;
1354 		}
1355 		if (i == vcp->vcp_ncpus)
1356 			return (ret);
1357 
1358 		/* Some more threads to wait for, start over */
1359 	}
1360 
1361 	return (ret);
1362 }
1363 
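/*
 * event_thread
 *
 * Thread entry point that runs the libevent loop for the VM. When
 * event_dispatch() returns, the thread marks itself done via *arg and
 * signals the main thread before returning the dispatch result.
 */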
1364 void *
1365 event_thread(void *arg)
1366 {
1367 	uint8_t *donep = arg;
1368 	intptr_t ret;
1369 
1370 	ret = event_dispatch();
1371 
1372 	mutex_lock(&threadmutex);
1373 	*donep = 1;
1374 	pthread_cond_signal(&threadcond);
1375 	mutex_unlock(&threadmutex);
1376 
1377 	return (void *)ret;
1378 }
1379 
1380 /*
1381  * vcpu_run_loop
1382  *
1383  * Runs a single VCPU until vmm(4) requires help handling an exit,
1384  * or the VM terminates.
1385  *
1386  * Parameters:
1387  *  arg: vcpu_run_params for the VCPU being run by this thread
1388  *
1389  * Return values:
1390  *  NULL: the VCPU shutdown properly
1391  *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
1392  */
1393 void *
1394 vcpu_run_loop(void *arg)
1395 {
1396 	struct vm_run_params *vrp = (struct vm_run_params *)arg;
1397 	intptr_t ret = 0;
1398 	int irq;
1399 	uint32_t n;
1400 
1401 	vrp->vrp_continue = 0;
1402 	n = vrp->vrp_vcpu_id;
1403 
1404 	for (;;) {
1405 		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
1406 
1407 		if (ret) {
1408 			log_warnx("%s: can't lock vcpu run mtx (%d)",
1409 			    __func__, (int)ret);
1410 			return ((void *)ret);
1411 		}
1412 
1413 		/* If we are halted and need to pause, pause */
1414 		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
1415 			ret = pthread_barrier_wait(&vm_pause_barrier);
1416 			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
1417 				log_warnx("%s: could not wait on pause barrier (%d)",
1418 				    __func__, (int)ret);
1419 				return ((void *)ret);
1420 			}
1421 
1422 			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
1423 			if (ret) {
1424 				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
1425 				    __func__, (int)ret);
1426 				return ((void *)ret);
1427 			}
1428 
1429 			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
1430 			    &vcpu_unpause_mtx[n]);
1431 			if (ret) {
1432 				log_warnx(
1433 				    "%s: can't wait on unpause cond (%d)",
1434 				    __func__, (int)ret);
1435 				break;
1436 			}
1437 			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
1438 			if (ret) {
1439 				log_warnx("%s: can't unlock unpause mtx (%d)",
1440 				    __func__, (int)ret);
1441 				break;
1442 			}
1443 		}
1444 
1445 		/* If we are halted and not paused, wait */
1446 		if (vcpu_hlt[n]) {
1447 			ret = pthread_cond_wait(&vcpu_run_cond[n],
1448 			    &vcpu_run_mtx[n]);
1449 
1450 			if (ret) {
1451 				log_warnx(
1452 				    "%s: can't wait on cond (%d)",
1453 				    __func__, (int)ret);
1454 				(void)pthread_mutex_unlock(
1455 				    &vcpu_run_mtx[n]);
1456 				break;
1457 			}
1458 		}
1459 
1460 		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
1461 
1462 		if (ret) {
1463 			log_warnx("%s: can't unlock mutex on cond (%d)",
1464 			    __func__, (int)ret);
1465 			break;
1466 		}
1467 
1468 		if (vrp->vrp_irqready && i8259_is_pending()) {
1469 			irq = i8259_ack();
1470 			vrp->vrp_irq = irq;
1471 		} else
1472 			vrp->vrp_irq = 0xFFFF;
1473 
1474 		/* Still more pending? */
1475 		if (i8259_is_pending()) {
1476 			/*
1477 			 * XXX can probably avoid ioctls here by providing intr
1478 			 * in vrp
1479 			 */
1480 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1481 			    vrp->vrp_vcpu_id, 1)) {
1482 				fatal("can't set INTR");
1483 			}
1484 		} else {
1485 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1486 			    vrp->vrp_vcpu_id, 0)) {
1487 				fatal("can't clear INTR");
1488 			}
1489 		}
1490 
1491 		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
1492 			/* If run ioctl failed, exit */
1493 			ret = errno;
1494 			log_warn("%s: vm %d / vcpu %d run ioctl failed",
1495 			    __func__, vrp->vrp_vm_id, n);
1496 			break;
1497 		}
1498 
1499 		/* If the VM is terminating, exit normally */
1500 		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
1501 			ret = (intptr_t)NULL;
1502 			break;
1503 		}
1504 
1505 		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
1506 			/*
1507 			 * vmm(4) needs help handling an exit, handle in
1508 			 * vcpu_exit.
1509 			 */
1510 			ret = vcpu_exit(vrp);
1511 			if (ret)
1512 				break;
1513 		}
1514 	}
1515 
1516 	mutex_lock(&threadmutex);
1517 	vcpu_done[n] = 1;
1518 	pthread_cond_signal(&threadcond);
1519 	mutex_unlock(&threadmutex);
1520 
1521 	return ((void *)ret);
1522 }
1523 
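/*
 * vcpu_pic_intr
 *
 * Requests vmm(4) to assert (intr = 1) or deassert (intr = 0) the INTR
 * pin of the indicated VCPU via the VMM_IOC_INTR ioctl.
 *
 * Return values:
 *  0: success
 *  !0 : errno from the failed ioctl
 */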
1524 int
1525 vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
1526 {
1527 	struct vm_intr_params vip;
1528 
1529 	memset(&vip, 0, sizeof(vip));
1530 
1531 	vip.vip_vm_id = vm_id;
1532 	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
1533 	vip.vip_intr = intr;
1534 
1535 	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
1536 		return (errno);
1537 
1538 	return (0);
1539 }
1540 
1541 /*
1542  * vcpu_exit_pci
1543  *
1544  * Handle all I/O to the emulated PCI subsystem.
1545  *
1546  * Parameters:
1547  *  vrp: vcpu run parameters containing guest state for this exit
1548  *
1549  * Return value:
1550  *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
1551  *      be injected.
1552  */
1553 uint8_t
1554 vcpu_exit_pci(struct vm_run_params *vrp)
1555 {
1556 	struct vm_exit *vei = vrp->vrp_exit;
1557 	uint8_t intr;
1558 
1559 	intr = 0xFF;
1560 
1561 	switch (vei->vei.vei_port) {
1562 	case PCI_MODE1_ADDRESS_REG:
1563 		pci_handle_address_reg(vrp);
1564 		break;
1565 	case PCI_MODE1_DATA_REG:
1566 	case PCI_MODE1_DATA_REG + 1:
1567 	case PCI_MODE1_DATA_REG + 2:
1568 	case PCI_MODE1_DATA_REG + 3:
1569 		pci_handle_data_reg(vrp);
1570 		break;
1571 	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
1572 		intr = pci_handle_io(vrp);
1573 		break;
1574 	default:
1575 		log_warnx("%s: unknown PCI register 0x%llx",
1576 		    __progname, (uint64_t)vei->vei.vei_port);
1577 		break;
1578 	}
1579 
1580 	return (intr);
1581 }
1582 
1583 /*
1584  * vcpu_exit_inout
1585  *
1586  * Handle all I/O exits that need to be emulated in vmd. This includes the
1587  * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
1588  *
1589  * Parameters:
1590  *  vrp: vcpu run parameters containing guest state for this exit
1591  */
1592 void
1593 vcpu_exit_inout(struct vm_run_params *vrp)
1594 {
1595 	struct vm_exit *vei = vrp->vrp_exit;
1596 	uint8_t intr = 0xFF;
1597 
1598 	if (ioports_map[vei->vei.vei_port] != NULL)
1599 		intr = ioports_map[vei->vei.vei_port](vrp);
1600 	else if (vei->vei.vei_dir == VEI_DIR_IN)
1601 		set_return_data(vei, 0xFFFFFFFF);
1602 
1603 	if (intr != 0xFF)
1604 		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
1605 }
1606 
1607 /*
1608  * vcpu_exit_eptviolation
1609  *
1610  * Handle an EPT violation
1611  *
1612  * Parameters:
1613  *  vrp: vcpu run parameters containing guest state for this exit
1614  *
1615  * Return values:
1616  *  0: no action required
1617  *  EAGAIN: a protection fault occurred, kill the vm.
1618  */
1619 int
1620 vcpu_exit_eptviolation(struct vm_run_params *vrp)
1621 {
1622 	struct vm_exit *ve = vrp->vrp_exit;
1623 
1624 	/*
1625 	 * The VM may be exiting to vmd to handle a pending interrupt,
1626 	 * but the last exit type may have been VMX_EXIT_EPT_VIOLATION, so
1627 	 * check the fault_type to ensure we really are processing
1628 	 * a VMX_EXIT_EPT_VIOLATION.
1629 	 */
1630 	if (ve->vee.vee_fault_type == VEE_FAULT_PROTECT) {
1631 		log_debug("%s: EPT Violation: rip=0x%llx",
1632 		    __progname, vrp->vrp_exit->vrs.vrs_gprs[VCPU_REGS_RIP]);
1633 		return (EAGAIN);
1634 	}
1635 
1636 	return (0);
1637 }
1638 
1639 /*
1640  * vcpu_exit
1641  *
1642  * Handle a vcpu exit. This function is called when it is determined that
1643  * vmm(4) requires the assistance of vmd to support a particular guest
1644  * exit type (eg, accessing an I/O port or device). Guest state is contained
1645  * in 'vrp', and will be resent to vmm(4) on exit completion.
1646  *
1647  * Upon conclusion of handling the exit, the function determines if any
1648  * interrupts should be injected into the guest, and asserts the proper
1649  * IRQ line whose interrupt should be vectored.
1650  *
1651  * Parameters:
1652  *  vrp: vcpu run parameters containing guest state for this exit
1653  *
1654  * Return values:
1655  *  0: the exit was handled successfully
1656  *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
1657  */
1658 int
1659 vcpu_exit(struct vm_run_params *vrp)
1660 {
1661 	int ret;
1662 
1663 	switch (vrp->vrp_exit_reason) {
1664 	case VMX_EXIT_INT_WINDOW:
1665 	case SVM_VMEXIT_VINTR:
1666 	case VMX_EXIT_CPUID:
1667 	case VMX_EXIT_EXTINT:
1668 	case SVM_VMEXIT_INTR:
1669 	case SVM_VMEXIT_NPF:
1670 	case SVM_VMEXIT_MSR:
1671 	case SVM_VMEXIT_CPUID:
1672 		/*
1673 		 * We may be exiting to vmd to handle a pending interrupt but
1674 		 * at the same time the last exit type may have been one of
1675 		 * these. In this case, there's nothing extra to be done
1676 		 * here (and falling through to the default case below results
1677 		 * in more vmd log spam).
1678 		 */
1679 		break;
1680 	case VMX_EXIT_EPT_VIOLATION:
1681 		ret = vcpu_exit_eptviolation(vrp);
1682 		if (ret)
1683 			return (ret);
1684 
1685 		break;
1686 	case VMX_EXIT_IO:
1687 	case SVM_VMEXIT_IOIO:
1688 		vcpu_exit_inout(vrp);
1689 		break;
1690 	case VMX_EXIT_HLT:
1691 	case SVM_VMEXIT_HLT:
1692 		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1693 		if (ret) {
1694 			log_warnx("%s: can't lock vcpu mutex (%d)",
1695 			    __func__, ret);
1696 			return (ret);
1697 		}
1698 		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
1699 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1700 		if (ret) {
1701 			log_warnx("%s: can't unlock vcpu mutex (%d)",
1702 			    __func__, ret);
1703 			return (ret);
1704 		}
1705 		break;
1706 	case VMX_EXIT_TRIPLE_FAULT:
1707 	case SVM_VMEXIT_SHUTDOWN:
1708 		/* reset VM */
1709 		return (EAGAIN);
1710 	default:
1711 		log_debug("%s: unknown exit reason 0x%x",
1712 		    __progname, vrp->vrp_exit_reason);
1713 	}
1714 
1715 	/* Process any pending traffic */
1716 	vionet_process_rx(vrp->vrp_vm_id);
1717 
1718 	vrp->vrp_continue = 1;
1719 
1720 	return (0);
1721 }
1722 
1723 /*
1724  * find_gpa_range
1725  *
1726  * Search for a contiguous guest physical mem range.
1727  *
1728  * Parameters:
1729  *  vcp: VM create parameters that contain the memory map to search in
1730  *  gpa: the starting guest physical address
1731  *  len: the length of the memory range
1732  *
1733  * Return values:
1734  *  NULL: on failure if there is no memory range as described by the parameters
1735  *  Pointer to vm_mem_range that contains the start of the range otherwise.
1736  */
1737 static struct vm_mem_range *
1738 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
1739 {
1740 	size_t i, n;
1741 	struct vm_mem_range *vmr;
1742 
1743 	/* Find the first vm_mem_range that contains gpa */
1744 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1745 		vmr = &vcp->vcp_memranges[i];
1746 		if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
1747 			break;
1748 	}
1749 
1750 	/* No range found. */
1751 	if (i == vcp->vcp_nmemranges)
1752 		return (NULL);
1753 
1754 	/*
1755 	 * vmr may cover the range [gpa, gpa + len) only partly. Make
1756 	 * sure that the following vm_mem_ranges are contiguous and
1757 	 * cover the rest.
1758 	 */
1759 	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
1760 	if (len < n)
1761 		len = 0;
1762 	else
1763 		len -= n;
1764 	gpa = vmr->vmr_gpa + vmr->vmr_size;
1765 	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
1766 		vmr = &vcp->vcp_memranges[i];
1767 		if (gpa != vmr->vmr_gpa)
1768 			return (NULL);
1769 		if (len <= vmr->vmr_size)
1770 			len = 0;
1771 		else
1772 			len -= vmr->vmr_size;
1773 
1774 		gpa = vmr->vmr_gpa + vmr->vmr_size;
1775 	}
1776 
1777 	if (len != 0)
1778 		return (NULL);
1779 
1780 	return (vmr);
1781 }
1782 
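/*
 * vaddr_mem
 *
 * Translates a guest physical address range into the corresponding
 * local virtual address, provided [gpa, gpa + len) lies entirely within
 * a single mapped guest memory range.
 *
 * Return values:
 *  Pointer to the local mapping, or NULL if the range is not mapped.
 */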
1783 void *
1784 vaddr_mem(paddr_t gpa, size_t len)
1785 {
1786 	struct vm_create_params *vcp = &current_vm->vm_params.vmc_params;
1787 	size_t i;
1788 	struct vm_mem_range *vmr;
1789 	paddr_t gpend = gpa + len;
1790 
1791 	/* Find the first vm_mem_range that contains gpa */
1792 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1793 		vmr = &vcp->vcp_memranges[i];
1794 		if (gpa < vmr->vmr_gpa)
1795 			continue;
1796 
1797 		if (gpend >= vmr->vmr_gpa + vmr->vmr_size)
1798 			continue;
1799 
1800 		return ((char *)vmr->vmr_va + (gpa - vmr->vmr_gpa));
1801 	}
1802 
1803 	return (NULL);
1804 }
1805 
1806 /*
1807  * write_mem
1808  *
1809  * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
1810  *
1811  * Parameters:
1812  *  dst: the destination paddr_t in the guest VM
1813  *  buf: data to copy (or NULL to zero the data)
1814  *  len: number of bytes to copy
1815  *
1816  * Return values:
1817  *  0: success
1818  *  EINVAL: if the guest physical memory range [dst, dst + len) does not
1819  *      exist in the guest.
1820  */
1821 int
1822 write_mem(paddr_t dst, const void *buf, size_t len)
1823 {
1824 	const char *from = buf;
1825 	char *to;
1826 	size_t n, off;
1827 	struct vm_mem_range *vmr;
1828 
1829 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
1830 	if (vmr == NULL) {
1831 		errno = EINVAL;
1832 		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
1833 		    "len = 0x%zx", __func__, dst, len);
1834 		return (EINVAL);
1835 	}
1836 
1837 	off = dst - vmr->vmr_gpa;
1838 	while (len != 0) {
1839 		n = vmr->vmr_size - off;
1840 		if (len < n)
1841 			n = len;
1842 
1843 		to = (char *)vmr->vmr_va + off;
1844 		if (buf == NULL)
1845 			memset(to, 0, n);
1846 		else {
1847 			memcpy(to, from, n);
1848 			from += n;
1849 		}
1850 		len -= n;
1851 		off = 0;
1852 		vmr++;
1853 	}
1854 
1855 	return (0);
1856 }
1857 
1858 /*
1859  * read_mem
1860  *
1861  * Reads memory at guest paddr 'src' into 'buf'.
1862  *
1863  * Parameters:
1864  *  src: the source paddr_t in the guest VM to read from.
1865  *  buf: destination (local) buffer
1866  *  len: number of bytes to read
1867  *
1868  * Return values:
1869  *  0: success
1870  *  EINVAL: if the guest physical memory range [src, src + len) does not
1871  *      exist in the guest.
1872  */
1873 int
1874 read_mem(paddr_t src, void *buf, size_t len)
1875 {
1876 	char *from, *to = buf;
1877 	size_t n, off;
1878 	struct vm_mem_range *vmr;
1879 
1880 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
1881 	if (vmr == NULL) {
1882 		errno = EINVAL;
1883 		log_warn("%s: failed - invalid memory range src = 0x%lx, "
1884 		    "len = 0x%zx", __func__, src, len);
1885 		return (EINVAL);
1886 	}
1887 
1888 	off = src - vmr->vmr_gpa;
1889 	while (len != 0) {
1890 		n = vmr->vmr_size - off;
1891 		if (len < n)
1892 			n = len;
1893 
1894 		from = (char *)vmr->vmr_va + off;
1895 		memcpy(to, from, n);
1896 
1897 		to += n;
1898 		len -= n;
1899 		off = 0;
1900 		vmr++;
1901 	}
1902 
1903 	return (0);
1904 }
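
/*
 * Usage sketch for read_mem() (illustrative only; the guest-physical
 * address below is hypothetical): copy a small guest structure into a
 * local buffer before decoding it, since the guest data may span more
 * than one vm_mem_range.
 *
 *	uint64_t desc[2];
 *
 *	if (read_mem(0x3000, desc, sizeof(desc)))
 *		log_warnx("failed to read descriptor from guest");
 */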
1905 
1906 int
1907 iovec_mem(paddr_t src, size_t len, struct iovec *iov, int iovcnt)
1908 {
1909 	size_t n, off;
1910 	struct vm_mem_range *vmr;
1911 	int niov = 0;
1912 
1913 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
1914 	if (vmr == NULL) {
1915 		errno = EINVAL;
1916 		return (-1);
1917 	}
1918 
1919 	off = src - vmr->vmr_gpa;
1920 	while (len > 0) {
1921 		if (niov == iovcnt) {
1922 			errno = ENOMEM;
1923 			return (-1);
1924 		}
1925 
1926 		n = vmr->vmr_size - off;
1927 		if (len < n)
1928 			n = len;
1929 
1930 		iov[niov].iov_base = (char *)vmr->vmr_va + off;
1931 		iov[niov].iov_len = n;
1932 
1933 		niov++;
1934 
1935 		len -= n;
1936 		off = 0;
1937 		vmr++;
1938 	}
1939 
1940 	return (niov);
1941 }
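
/*
 * Usage sketch for iovec_mem() (illustrative only; 'diskfd' and the
 * guest-physical address below are hypothetical): build iovecs over
 * the host mappings of a guest buffer and fill it directly with
 * readv(2), avoiding an intermediate bounce buffer.
 *
 *	struct iovec iov[4];
 *	int niov;
 *
 *	niov = iovec_mem(0x10000, 2 * 4096, iov, 4);
 *	if (niov == -1)
 *		log_warn("iovec_mem");
 *	else if (readv(diskfd, iov, niov) == -1)
 *		log_warn("readv");
 */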
1942 
1943 /*
1944  * vcpu_assert_pic_irq
1945  *
1946  * Injects the specified IRQ on the supplied vcpu/vm
1947  *
1948  * Parameters:
1949  *  vm_id: VM ID to inject to
1950  *  vcpu_id: VCPU ID to inject to
1951  *  irq: IRQ to inject
1952  */
1953 void
1954 vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1955 {
1956 	int ret;
1957 
1958 	i8259_assert_irq(irq);
1959 
1960 	if (i8259_is_pending()) {
1961 		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
1962 			fatalx("%s: can't assert INTR", __func__);
1963 
1964 		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
1965 		if (ret)
1966 			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);
1967 
1968 		vcpu_hlt[vcpu_id] = 0;
1969 		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
1970 		if (ret)
1971 			fatalx("%s: can't signal (%d)", __func__, ret);
1972 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
1973 		if (ret)
1974 			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
1975 	}
1976 }
1977 
1978 /*
1979  * vcpu_deassert_pic_irq
1980  *
1981  * Clears the specified IRQ on the supplied vcpu/vm
1982  *
1983  * Parameters:
1984  *  vm_id: VM ID to clear in
1985  *  vcpu_id: VCPU ID to clear in
1986  *  irq: IRQ to clear
1987  */
1988 void
1989 vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1990 {
1991 	i8259_deassert_irq(irq);
1992 
1993 	if (!i8259_is_pending()) {
1994 		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
1995 			fatalx("%s: can't deassert INTR for vm_id %d, "
1996 			    "vcpu_id %d", __func__, vm_id, vcpu_id);
1997 	}
1998 }
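
/*
 * Usage sketch (illustrative only; 'vm_id', the vcpu and the IRQ line
 * below are hypothetical): an emulated device asserts its IRQ when it
 * has work for the guest and deasserts it once the guest has
 * acknowledged the condition, e.g. by reading the device's status
 * register.
 *
 *	vcpu_assert_pic_irq(vm_id, 0, 9);
 *	(guest takes the interrupt and services the device)
 *	vcpu_deassert_pic_irq(vm_id, 0, 9);
 */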
1999 
2000 /*
2001  * fd_hasdata
2002  *
2003  * Determines if data can be read from a file descriptor.
2004  *
2005  * Parameters:
2006  *  fd: the fd to check
2007  *
2008  * Return values:
2009  *  1 if data can be read from an fd, or 0 otherwise.
2010  */
2011 int
2012 fd_hasdata(int fd)
2013 {
2014 	struct pollfd pfd[1];
2015 	int nready, hasdata = 0;
2016 
2017 	pfd[0].fd = fd;
2018 	pfd[0].events = POLLIN;
2019 	nready = poll(pfd, 1, 0);
2020 	if (nready == -1)
2021 		log_warn("checking file descriptor for data failed");
2022 	else if (nready == 1 && pfd[0].revents & POLLIN)
2023 		hasdata = 1;
2024 	return (hasdata);
2025 }
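
/*
 * Usage sketch for fd_hasdata() (illustrative only; 'confd' is
 * hypothetical): check a console fd before reading so the emulation
 * path never blocks waiting for input.
 *
 *	char ch;
 *
 *	if (fd_hasdata(confd) && read(confd, &ch, 1) == 1)
 *		log_debug("console input: 0x%02x", (uint8_t)ch);
 */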
2026 
2027 /*
2028  * mutex_lock
2029  *
2030  * Wrapper function for pthread_mutex_lock that does error checking and that
2031  * exits on failure
2032  */
2033 void
2034 mutex_lock(pthread_mutex_t *m)
2035 {
2036 	int ret;
2037 
2038 	ret = pthread_mutex_lock(m);
2039 	if (ret) {
2040 		errno = ret;
2041 		fatal("could not acquire mutex");
2042 	}
2043 }
2044 
2045 /*
2046  * mutex_unlock
2047  *
2048  * Wrapper function for pthread_mutex_unlock that does error checking and that
2049  * exits on failure
2050  */
2051 void
2052 mutex_unlock(pthread_mutex_t *m)
2053 {
2054 	int ret;
2055 
2056 	ret = pthread_mutex_unlock(m);
2057 	if (ret) {
2058 		errno = ret;
2059 		fatal("could not release mutex");
2060 	}
2061 }
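
/*
 * Usage sketch (illustrative only; assumes a valid 'vcpu_id' is in
 * scope): the wrappers turn pthread errors into fatal exits, so a
 * short critical section such as clearing a vcpu's halt state needs
 * no per-call error handling.
 *
 *	mutex_lock(&vcpu_run_mtx[vcpu_id]);
 *	vcpu_hlt[vcpu_id] = 0;
 *	mutex_unlock(&vcpu_run_mtx[vcpu_id]);
 */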
2062 
2063 /*
2064  * set_return_data
2065  *
2066  * Utility function for manipulating register data in vm exit info structs. This
2067  * function ensures that the data is copied to the vei->vei.vei_data field with
2068  * the proper size for the operation being performed.
2069  *
2070  * Parameters:
2071  *  vei: exit information
2072  *  data: return data
2073  */
2074 void
2075 set_return_data(struct vm_exit *vei, uint32_t data)
2076 {
2077 	switch (vei->vei.vei_size) {
2078 	case 1:
2079 		vei->vei.vei_data &= ~0xFF;
2080 		vei->vei.vei_data |= (uint8_t)data;
2081 		break;
2082 	case 2:
2083 		vei->vei.vei_data &= ~0xFFFF;
2084 		vei->vei.vei_data |= (uint16_t)data;
2085 		break;
2086 	case 4:
2087 		vei->vei.vei_data = data;
2088 		break;
2089 	}
2090 }
2091 
2092 /*
2093  * get_input_data
2094  *
2095  * Utility function for manipulating register data in vm exit info
2096  * structs. This function ensures that the data is copied from the
2097  * vei->vei.vei_data field with the proper size for the operation being
2098  * performed.
2099  *
2100  * Parameters:
2101  *  vei: exit information
2102  *  data: location to store the result
2103  */
2104 void
2105 get_input_data(struct vm_exit *vei, uint32_t *data)
2106 {
2107 	switch (vei->vei.vei_size) {
2108 	case 1:
2109 		*data &= 0xFFFFFF00;
2110 		*data |= (uint8_t)vei->vei.vei_data;
2111 		break;
2112 	case 2:
2113 		*data &= 0xFFFF0000;
2114 		*data |= (uint16_t)vei->vei.vei_data;
2115 		break;
2116 	case 4:
2117 		*data = vei->vei.vei_data;
2118 		break;
2119 	default:
2120 		log_warnx("%s: invalid i/o size %d", __func__,
2121 		    vei->vei.vei_size);
2122 	}
2123 
2124 }
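
/*
 * Usage sketch (illustrative only; the handler and its register are
 * hypothetical): an i/o port handler uses get_input_data() on OUT
 * exits so that only vei_size bytes of the guest's value are taken,
 * and set_return_data() on IN exits so that only vei_size bytes of
 * the emulated register are returned to the guest.
 *
 *	static uint32_t dummy_reg;
 *
 *	void
 *	dummy_port_io(struct vm_run_params *vrp)
 *	{
 *		struct vm_exit *vei = vrp->vrp_exit;
 *
 *		if (vei->vei.vei_dir == VEI_DIR_OUT)
 *			get_input_data(vei, &dummy_reg);
 *		else
 *			set_return_data(vei, dummy_reg);
 *	}
 */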
2125 
2126 /*
2127  * translate_gva
2128  *
2129  * Translates a guest virtual address to a guest physical address by walking
2130  * the currently active page table (if needed).
2131  *
2132  * Note - this function can possibly alter the supplied VCPU state.
2133  *  Specifically, it may inject exceptions depending on the current VCPU
2134  *  configuration, and may alter %cr2 on #PF. Consequently, this function
2135  *  should only be used as part of instruction emulation.
2136  *
2137  * Parameters:
2138  *  exit: The VCPU this translation should be performed for (guest MMU settings
2139  *   are gathered from this VCPU)
2140  *  va: virtual address to translate
2141  *  pa: pointer to paddr_t variable that will receive the translated physical
2142  *   address. 'pa' is unchanged on error.
2143  *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
2144  *   the address should be translated
2145  *
2146  * Return values:
2147  *  0: the address was successfully translated - 'pa' contains the physical
2148  *     address currently mapped by 'va'.
2149  *  EFAULT: the PTE for 'va' is unmapped or a paging structure could not
2150  *     be read; a #PF will be injected and %cr2 set in the vcpu structure.
2151  *  EINVAL: 'pa' is NULL or the vcpu is not in protected mode
2152  */
2153 int
2154 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
2155 {
2156 	int level, shift, pdidx;
2157 	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
2158 	uint64_t shift_width, pte_size;
2159 	struct vcpu_reg_state *vrs;
2160 
2161 	vrs = &exit->vrs;
2162 
2163 	if (!pa)
2164 		return (EINVAL);
2165 
2166 	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
2167 		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
2168 		*pa = va;
2169 		return (0);
2170 	}
2171 
2172 	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
2173 
2174 	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
2175 	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
2176 
2177 	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
2178 		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
2179 			pte_size = sizeof(uint64_t);
2180 			shift_width = 9;
2181 
2182 			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
2183 				/* 4 level paging */
2184 				level = 4;
2185 				mask = L4_MASK;
2186 				shift = L4_SHIFT;
2187 			} else {
2188 				/* 32 bit with PAE paging */
2189 				level = 3;
2190 				mask = L3_MASK;
2191 				shift = L3_SHIFT;
2192 			}
2193 		} else {
2194 			/* 32 bit paging */
2195 			level = 2;
2196 			shift_width = 10;
2197 			mask = 0xFFC00000;
2198 			shift = 22;
2199 			pte_size = sizeof(uint32_t);
2200 		}
2201 	} else
2202 		return (EINVAL);
2203 
2204 	/* XXX: Check for R bit in segment selector and set A bit */
2205 
2206 	for (;level > 0; level--) {
2207 		pdidx = (va & mask) >> shift;
2208 		pte_paddr = (pt_paddr) + (pdidx * pte_size);
2209 
2210 		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
2211 		    level, pte_paddr);
2212 		if (read_mem(pte_paddr, &pte, pte_size)) {
2213 			log_warn("%s: failed to read pte", __func__);
2214 			return (EFAULT);
2215 		}
2216 
2217 		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
2218 		    pte);
2219 
2220 		/* XXX: Set CR2  */
2221 		if (!(pte & PG_V))
2222 			return (EFAULT);
2223 
2224 		/* XXX: Check for SMAP */
2225 		if ((mode == PROT_WRITE) && !(pte & PG_RW))
2226 			return (EPERM);
2227 
2228 		if ((exit->cpl > 0) && !(pte & PG_u))
2229 			return (EPERM);
2230 
2231 		pte = pte | PG_U;
2232 		if (mode == PROT_WRITE)
2233 			pte = pte | PG_M;
2234 		if (write_mem(pte_paddr, &pte, pte_size)) {
2235 			log_warn("%s: failed to write back flags to pte",
2236 			    __func__);
2237 			return (EIO);
2238 		}
2239 
2240 		/* XXX: EINVAL if in 32 bit mode and PG_PS is 1 but CR4.PSE is 0 */
2241 		if (pte & PG_PS)
2242 			break;
2243 
2244 		if (level > 1) {
2245 			pt_paddr = pte & PG_FRAME;
2246 			shift -= shift_width;
2247 			mask = mask >> shift_width;
2248 		}
2249 	}
2250 
2251 	low_mask = (1 << shift) - 1;
2252 	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
2253 	*pa = (pte & high_mask) | (va & low_mask);
2254 
2255 	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);
2256 
2257 	return (0);
2258 }
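
/*
 * Usage sketch (illustrative only; 'exit' comes from a hypothetical
 * instruction-emulation path): translate the guest's current %rip to
 * a guest-physical address, then fetch the instruction bytes it maps
 * with read_mem().
 *
 *	uint64_t gpa;
 *	uint8_t insn[16];
 *
 *	if (translate_gva(exit, exit->vrs.vrs_gprs[VCPU_REGS_RIP], &gpa,
 *	    PROT_READ) == 0 &&
 *	    read_mem(gpa, insn, sizeof(insn)) == 0)
 *		log_debug("fetched %zu bytes at GPA 0x%llx",
 *		    sizeof(insn), gpa);
 */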
2259 
2260 /*
2261  * vm_pipe_init
2262  *
2263  * Initialize a vm_dev_pipe, setting up its file descriptors and its
2264  * event structure with the given callback.
2265  *
2266  * Parameters:
2267  *  p: pointer to vm_dev_pipe struct to initialize
2268  *  cb: callback to use for READ events on the read end of the pipe
2269  */
2270 void
2271 vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
2272 {
2273 	int ret;
2274 	int fds[2];
2275 
2276 	memset(p, 0, sizeof(struct vm_dev_pipe));
2277 
2278 	ret = pipe(fds);
2279 	if (ret)
2280 		fatal("failed to create vm_dev_pipe pipe");
2281 
2282 	p->read = fds[0];
2283 	p->write = fds[1];
2284 
2285 	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
2286 }
2287 
2288 /*
2289  * vm_pipe_send
2290  *
2291  * Send a message to an emulated device via the provided vm_dev_pipe.
2292  *
2293  * Parameters:
2294  *  p: pointer to initialized vm_dev_pipe
2295  *  msg: message to send in the channel
2296  */
2297 void
2298 vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
2299 {
2300 	size_t n;
2301 	n = write(p->write, &msg, sizeof(msg));
2302 	if (n != sizeof(msg))
2303 		fatal("failed to write to device pipe");
2304 }
2305 
2306 /*
2307  * vm_pipe_recv
2308  *
2309  * Receive a message for an emulated device via the provided vm_dev_pipe.
2310  * Returns the message value, otherwise will exit on failure.
2311  *
2312  * Parameters:
2313  *  p: pointer to initialized vm_dev_pipe
2314  *
2315  * Return values:
2316  *  a value of enum pipe_msg_type or fatal exit on read(2) error
2317  */
2318 enum pipe_msg_type
2319 vm_pipe_recv(struct vm_dev_pipe *p)
2320 {
2321 	size_t n;
2322 	enum pipe_msg_type msg;
2323 	n = read(p->read, &msg, sizeof(msg));
2324 	if (n != sizeof(msg))
2325 		fatal("failed to read from device pipe");
2326 
2327 	return msg;
2328 }
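
/*
 * Usage sketch (illustrative only; the callback, device pipe and
 * MSG_DEV_WAKEUP message value are hypothetical): a device sets up
 * its pipe once and adds the read end to the event loop; other
 * threads then post messages with vm_pipe_send() that the callback
 * drains with vm_pipe_recv() in the event thread.
 *
 *	static struct vm_dev_pipe dev_pipe;
 *
 *	static void
 *	dev_dispatch(int fd, short event, void *arg)
 *	{
 *		enum pipe_msg_type msg = vm_pipe_recv(&dev_pipe);
 *
 *		if (msg == MSG_DEV_WAKEUP)
 *			(wake the device up)
 *	}
 *
 *	vm_pipe_init(&dev_pipe, dev_dispatch);
 *	event_add(&dev_pipe.read_ev, NULL);
 *	(later, possibly from another thread)
 *	vm_pipe_send(&dev_pipe, MSG_DEV_WAKEUP);
 */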
2329