xref: /openbsd-src/usr.sbin/vmd/vm.c (revision 25c4e8bd056e974b28f4a0ffd39d76c190a56013)
1 /*	$OpenBSD: vm.c,v 1.71 2022/06/29 17:39:54 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* PAGE_SIZE */
20 #include <sys/types.h>
21 #include <sys/ioctl.h>
22 #include <sys/queue.h>
23 #include <sys/wait.h>
24 #include <sys/uio.h>
25 #include <sys/stat.h>
26 #include <sys/socket.h>
27 #include <sys/time.h>
28 #include <sys/mman.h>
29 #include <sys/resource.h>
30 
31 #include <dev/ic/i8253reg.h>
32 #include <dev/isa/isareg.h>
33 #include <dev/pci/pcireg.h>
34 
35 #include <machine/psl.h>
36 #include <machine/pte.h>
37 #include <machine/specialreg.h>
38 #include <machine/vmmvar.h>
39 
40 #include <net/if.h>
41 
42 #include <errno.h>
43 #include <event.h>
44 #include <fcntl.h>
45 #include <imsg.h>
46 #include <limits.h>
47 #include <poll.h>
48 #include <pthread.h>
49 #include <stddef.h>
50 #include <stdio.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <unistd.h>
54 #include <util.h>
55 
56 #include "atomicio.h"
57 #include "fw_cfg.h"
58 #include "i8253.h"
59 #include "i8259.h"
60 #include "loadfile.h"
61 #include "mc146818.h"
62 #include "ns8250.h"
63 #include "pci.h"
64 #include "virtio.h"
65 #include "vmd.h"
66 #include "vmm.h"
67 
68 #define MB(x)	((x) * 1024UL * 1024UL)
69 #define GB(x)	((x) * 1024UL * 1024UL * 1024UL)
70 
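/*
 * I/O port dispatch table: init_emulated_hw() and restore_emulated_hw()
 * point each in-use port at one of the vcpu_exit_* handlers, and
 * vcpu_exit_inout() dispatches through this table, treating a returned
 * value of 0xFF as "no interrupt to assert".
 */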
71 io_fn_t ioports_map[MAX_PORTS];
72 
73 int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
74     struct vmop_create_params *, struct vcpu_reg_state *);
75 void vm_dispatch_vmm(int, short, void *);
76 void *event_thread(void *);
77 void *vcpu_run_loop(void *);
78 int vcpu_exit(struct vm_run_params *);
79 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
80 void create_memory_map(struct vm_create_params *);
81 int alloc_guest_mem(struct vm_create_params *);
82 int vmm_create_vm(struct vm_create_params *);
83 void init_emulated_hw(struct vmop_create_params *, int,
84     int[][VM_MAX_BASE_PER_DISK], int *);
85 void restore_emulated_hw(struct vm_create_params *, int, int *,
86     int[][VM_MAX_BASE_PER_DISK], int);
87 void vcpu_exit_inout(struct vm_run_params *);
88 int vcpu_exit_eptviolation(struct vm_run_params *);
89 uint8_t vcpu_exit_pci(struct vm_run_params *);
90 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
91 int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
92 int send_vm(int, struct vm_create_params *);
93 int dump_send_header(int);
94 int dump_vmr(int, struct vm_mem_range *);
95 int dump_mem(int, struct vm_create_params *);
96 void restore_vmr(int, struct vm_mem_range *);
97 void restore_mem(int, struct vm_create_params *);
98 int restore_vm_params(int, struct vm_create_params *);
99 void pause_vm(struct vm_create_params *);
100 void unpause_vm(struct vm_create_params *);
101 
102 int translate_gva(struct vm_exit *, uint64_t, uint64_t *, int);
103 
104 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
105     size_t);
106 
107 int con_fd;
108 struct vmd_vm *current_vm;
109 
110 extern struct vmd *env;
111 
112 extern char *__progname;
113 
114 pthread_mutex_t threadmutex;
115 pthread_cond_t threadcond;
116 
117 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
118 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
119 pthread_barrier_t vm_pause_barrier;
120 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
121 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
122 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
123 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
124 
125 /*
126  * Represents a standard register set for an OS to be booted
127  * as a flat 64 bit address space.
128  *
129  * NOT set here are:
130  *  RIP
131  *  RSP
132  *  GDTR BASE
133  *
134  * Specific bootloaders should clone this structure and override
135  * those fields as needed.
136  *
137  * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
138  *        features of the CPU in use.
139  */
140 static const struct vcpu_reg_state vcpu_init_flat64 = {
141 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
142 	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
143 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
144 	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
145 	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
146 	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
147 	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
148 	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
149 	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
150 	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
151 	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
152 	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
153 	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
154 	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
155 	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
156 	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
157 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
158 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
159 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
160 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
161 	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
162 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
163 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
164 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
165 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
166 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
167 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
168 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
169 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
170 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
171 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
172 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
173 	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
174 	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
175 };
176 
177 /*
178  * Represents a standard register set for a BIOS to be booted
179  * as a flat 16 bit address space.
180  */
181 static const struct vcpu_reg_state vcpu_init_flat16 = {
182 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
183 	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
184 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
185 	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
186 	.vrs_crs[VCPU_REGS_CR3] = 0,
187 	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
188 	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
189 	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
190 	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
191 	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
192 	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
193 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
194 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
195 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
196 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
197 	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
198 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
199 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
200 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
201 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
202 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
203 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
204 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
205 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
206 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
207 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
208 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
209 	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
210 };
211 
212 /*
213  * loadfile_bios
214  *
215  * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
216  * directly into memory.
217  *
218  * Parameters:
219  *  fp: gzFile handle of the BIOS image to load
220  *  size: uncompressed size of the image
221  *  (out) vrs: register state to set on init for this kernel
222  *
223  * Return values:
224  *  0 if successful
225  *  -1 if an error occurred while seeking or reading the image (errno may be set)
226  */
227 int
228 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
229 {
230 	off_t	 off;
231 
232 	/* Set up a "flat 16 bit" register state for BIOS */
233 	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
234 
235 	/* Seek to the beginning of the BIOS image */
236 	if (gzseek(fp, 0, SEEK_SET) == -1)
237 		return (-1);
238 
239 	/* The BIOS image must end at 1MB */
240 	if ((off = MB(1) - size) < 0)
241 		return (-1);
242 
243 	/* Read BIOS image into memory */
244 	if (mread(fp, off, size) != (size_t)size) {
245 		errno = EIO;
246 		return (-1);
247 	}
248 
249 	if (gzseek(fp, 0, SEEK_SET) == -1)
250 		return (-1);
251 
252 	/* Read a second BIOS copy into memory ending at 4GB */
253 	off = GB(4) - size;
254 	if (mread(fp, off, size) != (size_t)size) {
255 		errno = EIO;
256 		return (-1);
257 	}
258 
259 	log_debug("%s: loaded BIOS image", __func__);
260 
261 	return (0);
262 }
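
/*
 * Note: the image is intentionally installed twice.  The reset CS:IP in
 * the "flat 16 bit" register state above (CS base 0xF0000, RIP 0xFFF0)
 * lands in the copy ending at 1MB; the copy ending at 4GB mirrors where
 * firmware conventionally also appears near the top of the 32-bit
 * physical address space.
 */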
263 
264 /*
265  * start_vm
266  *
267  * After forking a new VM process, starts the new VM with the creation
268  * parameters supplied (in the incoming vm->vm_params field). This
269  * function performs a basic sanity check on the incoming parameters
270  * and then performs the following steps to complete the creation of the VM:
271  *
272  * 1. validates and creates the new VM
273  * 2. opens the imsg control channel to the parent and drops more privilege
274  * 3. drops additional privileges by calling pledge(2)
275  * 4. loads the kernel from the disk image or file descriptor
276  * 5. runs the VM's VCPU loops.
277  *
278  * Parameters:
279  *  vm: The VM data structure, including the VM create parameters.
280  *  fd: The imsg socket that is connected to the parent process.
281  *
282  * Return values:
283  *  0: success
284  *  !0 : failure - typically an errno indicating the source of the failure
285  */
286 int
287 start_vm(struct vmd_vm *vm, int fd)
288 {
289 	struct vmop_create_params *vmc = &vm->vm_params;
290 	struct vm_create_params	*vcp = &vmc->vmc_params;
291 	struct vcpu_reg_state	 vrs;
292 	int			 nicfds[VMM_MAX_NICS_PER_VM];
293 	int			 ret;
294 	gzFile			 fp;
295 	size_t			 i;
296 	struct vm_rwregs_params  vrp;
297 	struct stat		 sb;
298 
299 	/* Child */
300 	setproctitle("%s", vcp->vcp_name);
301 	log_procinit(vcp->vcp_name);
302 
303 	if (!(vm->vm_state & VM_STATE_RECEIVED))
304 		create_memory_map(vcp);
305 
306 	ret = alloc_guest_mem(vcp);
307 
308 	if (ret) {
309 		struct rlimit lim;
310 		char buf[FMT_SCALED_STRSIZE];
311 		if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) {
312 			if (fmt_scaled(lim.rlim_cur, buf) == 0)
313 				fatalx("could not allocate guest memory (data "
314 				    "limit is %s)", buf);
315 		}
316 		errno = ret;
317 		fatal("could not allocate guest memory");
318 	}
319 
320 	ret = vmm_create_vm(vcp);
321 	current_vm = vm;
322 
323 	/* send back the kernel-generated vm id (0 on error) */
324 	if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
325 	    sizeof(vcp->vcp_id))
326 		fatal("failed to send created vm id to vmm process");
327 
328 	if (ret) {
329 		errno = ret;
330 		fatal("create vmm ioctl failed - exiting");
331 	}
332 
333 	/*
334 	 * pledge in the vm processes:
335 	 * stdio - for malloc and basic I/O including events.
336 	 * recvfd - for send/recv.
337 	 * vmm - for the vmm ioctls and operations.
338 	 */
339 	if (pledge("stdio vmm recvfd", NULL) == -1)
340 		fatal("pledge");
341 
342 	if (vm->vm_state & VM_STATE_RECEIVED) {
343 		ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp));
344 		if (ret != sizeof(vrp))
345 			fatal("received incomplete vrp - exiting");
346 		vrs = vrp.vrwp_regs;
347 	} else {
348 		/*
349 		 * Set up default "flat 64 bit" register state - RIP,
350 		 * RSP, and GDT info will be set by the bootloader
351 		 */
352 		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
353 
354 		/* Find and open kernel image */
355 		if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
356 			fatalx("failed to open kernel - exiting");
357 
358 		/* Load kernel image */
359 		ret = loadfile_elf(fp, vcp, &vrs, vmc->vmc_bootdevice);
360 
361 		/*
362 		 * Try BIOS as a fallback (only if it was provided as an image
363 		 * with vm->vm_kernel and the file is not compressed)
364 		 */
365 		if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
366 		    gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
367 			ret = loadfile_bios(fp, sb.st_size, &vrs);
368 
369 		if (ret)
370 			fatal("failed to load kernel or BIOS - exiting");
371 
372 		gzclose(fp);
373 	}
374 
375 	if (vm->vm_kernel != -1)
376 		close(vm->vm_kernel);
377 
378 	con_fd = vm->vm_tty;
379 	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
380 		fatal("failed to set nonblocking mode on console");
381 
382 	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
383 		nicfds[i] = vm->vm_ifs[i].vif_fd;
384 
385 	event_init();
386 
387 	if (vm->vm_state & VM_STATE_RECEIVED) {
388 		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
389 		    vm->vm_disks, vm->vm_cdrom);
390 		restore_mem(vm->vm_receive_fd, vcp);
391 		if (restore_vm_params(vm->vm_receive_fd, vcp))
392 			fatal("restore vm params failed");
393 		unpause_vm(vcp);
394 	}
395 
396 	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
397 		fatal("setup vm pipe");
398 
399 	/* Execute the vcpu run loop(s) for this VM */
400 	ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);
401 
402 	/* Ensure that any in-flight data is written back */
403 	virtio_shutdown(vm);
404 
405 	return (ret);
406 }
407 
408 /*
409  * vm_dispatch_vmm
410  *
411  * imsg callback for messages that are received from the vmm parent process.
412  */
413 void
414 vm_dispatch_vmm(int fd, short event, void *arg)
415 {
416 	struct vmd_vm		*vm = arg;
417 	struct vmop_result	 vmr;
418 	struct vmop_addr_result	 var;
419 	struct imsgev		*iev = &vm->vm_iev;
420 	struct imsgbuf		*ibuf = &iev->ibuf;
421 	struct imsg		 imsg;
422 	ssize_t			 n;
423 	int			 verbose;
424 
425 	if (event & EV_READ) {
426 		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
427 			fatal("%s: imsg_read", __func__);
428 		if (n == 0)
429 			_exit(0);
430 	}
431 
432 	if (event & EV_WRITE) {
433 		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
434 			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
435 		if (n == 0)
436 			_exit(0);
437 	}
438 
439 	for (;;) {
440 		if ((n = imsg_get(ibuf, &imsg)) == -1)
441 			fatal("%s: imsg_get", __func__);
442 		if (n == 0)
443 			break;
444 
445 #if DEBUG > 1
446 		log_debug("%s: got imsg %d from %s",
447 		    __func__, imsg.hdr.type,
448 		    vm->vm_params.vmc_params.vcp_name);
449 #endif
450 
451 		switch (imsg.hdr.type) {
452 		case IMSG_CTL_VERBOSE:
453 			IMSG_SIZE_CHECK(&imsg, &verbose);
454 			memcpy(&verbose, imsg.data, sizeof(verbose));
455 			log_setverbose(verbose);
456 			break;
457 		case IMSG_VMDOP_VM_SHUTDOWN:
458 			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
459 				_exit(0);
460 			break;
461 		case IMSG_VMDOP_VM_REBOOT:
462 			if (vmmci_ctl(VMMCI_REBOOT) == -1)
463 				_exit(0);
464 			break;
465 		case IMSG_VMDOP_PAUSE_VM:
466 			vmr.vmr_result = 0;
467 			vmr.vmr_id = vm->vm_vmid;
468 			pause_vm(&vm->vm_params.vmc_params);
469 			imsg_compose_event(&vm->vm_iev,
470 			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
471 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
472 			    sizeof(vmr));
473 			break;
474 		case IMSG_VMDOP_UNPAUSE_VM:
475 			vmr.vmr_result = 0;
476 			vmr.vmr_id = vm->vm_vmid;
477 			unpause_vm(&vm->vm_params.vmc_params);
478 			imsg_compose_event(&vm->vm_iev,
479 			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
480 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
481 			    sizeof(vmr));
482 			break;
483 		case IMSG_VMDOP_SEND_VM_REQUEST:
484 			vmr.vmr_id = vm->vm_vmid;
485 			vmr.vmr_result = send_vm(imsg.fd,
486 			    &vm->vm_params.vmc_params);
487 			imsg_compose_event(&vm->vm_iev,
488 			    IMSG_VMDOP_SEND_VM_RESPONSE,
489 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
490 			    sizeof(vmr));
491 			if (!vmr.vmr_result) {
492 				imsg_flush(&current_vm->vm_iev.ibuf);
493 				_exit(0);
494 			}
495 			break;
496 		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
497 			IMSG_SIZE_CHECK(&imsg, &var);
498 			memcpy(&var, imsg.data, sizeof(var));
499 
500 			log_debug("%s: received tap addr %s for nic %d",
501 			    vm->vm_params.vmc_params.vcp_name,
502 			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);
503 
504 			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
505 			break;
506 		default:
507 			fatalx("%s: got invalid imsg %d from %s",
508 			    __func__, imsg.hdr.type,
509 			    vm->vm_params.vmc_params.vcp_name);
510 		}
511 		imsg_free(&imsg);
512 	}
513 	imsg_event_add(iev);
514 }
515 
516 /*
517  * vm_shutdown
518  *
519  * Tell the vmm parent process to shut down or reboot the VM, then exit.
520  */
521 __dead void
522 vm_shutdown(unsigned int cmd)
523 {
524 	switch (cmd) {
525 	case VMMCI_NONE:
526 	case VMMCI_SHUTDOWN:
527 		(void)imsg_compose_event(&current_vm->vm_iev,
528 		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
529 		break;
530 	case VMMCI_REBOOT:
531 		(void)imsg_compose_event(&current_vm->vm_iev,
532 		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
533 		break;
534 	default:
535 		fatalx("invalid vm ctl command: %d", cmd);
536 	}
537 	imsg_flush(&current_vm->vm_iev.ibuf);
538 
539 	_exit(0);
540 }
541 
542 int
543 send_vm(int fd, struct vm_create_params *vcp)
544 {
545 	struct vm_rwregs_params	   vrp;
546 	struct vm_rwvmparams_params vpp;
547 	struct vmop_create_params *vmc;
548 	struct vm_terminate_params vtp;
549 	unsigned int		   flags = 0;
550 	unsigned int		   i;
551 	int			   ret = 0;
552 	size_t			   sz;
553 
554 	if (dump_send_header(fd)) {
555 		log_info("%s: failed to send vm dump header", __func__);
556 		goto err;
557 	}
558 
559 	pause_vm(vcp);
560 
561 	vmc = calloc(1, sizeof(struct vmop_create_params));
562 	if (vmc == NULL) {
563 		log_warn("%s: calloc error getting vmc", __func__);
564 		ret = -1;
565 		goto err;
566 	}
567 
568 	flags |= VMOP_CREATE_MEMORY;
569 	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
570 	    vmop_create_params));
571 	vmc->vmc_flags = flags;
572 	vrp.vrwp_vm_id = vcp->vcp_id;
573 	vrp.vrwp_mask = VM_RWREGS_ALL;
574 	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
575 	vpp.vpp_vm_id = vcp->vcp_id;
576 
577 	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
578 	if (sz != sizeof(struct vmop_create_params)) {
579 		ret = -1;
580 		goto err;
581 	}
582 
583 	for (i = 0; i < vcp->vcp_ncpus; i++) {
584 		vrp.vrwp_vcpu_id = i;
585 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
586 			log_warn("%s: readregs failed", __func__);
587 			goto err;
588 		}
589 
590 		sz = atomicio(vwrite, fd, &vrp,
591 		    sizeof(struct vm_rwregs_params));
592 		if (sz != sizeof(struct vm_rwregs_params)) {
593 			log_warn("%s: dumping registers failed", __func__);
594 			ret = -1;
595 			goto err;
596 		}
597 	}
598 
599 	if ((ret = i8253_dump(fd)))
600 		goto err;
601 	if ((ret = i8259_dump(fd)))
602 		goto err;
603 	if ((ret = ns8250_dump(fd)))
604 		goto err;
605 	if ((ret = mc146818_dump(fd)))
606 		goto err;
607 	if ((ret = fw_cfg_dump(fd)))
608 		goto err;
609 	if ((ret = pci_dump(fd)))
610 		goto err;
611 	if ((ret = virtio_dump(fd)))
612 		goto err;
613 	if ((ret = dump_mem(fd, vcp)))
614 		goto err;
615 
616 	for (i = 0; i < vcp->vcp_ncpus; i++) {
617 		vpp.vpp_vcpu_id = i;
618 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
619 			log_warn("%s: readvmparams failed", __func__);
620 			goto err;
621 		}
622 
623 		sz = atomicio(vwrite, fd, &vpp,
624 		    sizeof(struct vm_rwvmparams_params));
625 		if (sz != sizeof(struct vm_rwvmparams_params)) {
626 			log_warn("%s: dumping vm params failed", __func__);
627 			ret = -1;
628 			goto err;
629 		}
630 	}
631 
632 	vtp.vtp_vm_id = vcp->vcp_id;
633 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
634 		log_warnx("%s: term IOC error: %d, %d", __func__,
635 		    errno, ENOENT);
636 	}
637 err:
638 	close(fd);
639 	if (ret)
640 		unpause_vm(vcp);
641 	return ret;
642 }
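
/*
 * For reference, send_vm() serializes the VM state in this order: dump
 * header, vmop_create_params, one vm_rwregs_params per vcpu, the i8253,
 * i8259, ns8250, mc146818, fw_cfg, pci and virtio device states, the
 * guest memory ranges, and finally one vm_rwvmparams_params per vcpu.
 */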
643 
644 int
645 dump_send_header(int fd)
{
646 	struct vm_dump_header	   vmh;
647 	int			   i;
648 
649 	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
650 	    sizeof(vmh.vmh_signature));
651 
652 	vmh.vmh_cpuids[0].code = 0x00;
653 	vmh.vmh_cpuids[0].leaf = 0x00;
654 
655 	vmh.vmh_cpuids[1].code = 0x01;
656 	vmh.vmh_cpuids[1].leaf = 0x00;
657 
658 	vmh.vmh_cpuids[2].code = 0x07;
659 	vmh.vmh_cpuids[2].leaf = 0x00;
660 
661 	vmh.vmh_cpuids[3].code = 0x0d;
662 	vmh.vmh_cpuids[3].leaf = 0x00;
663 
664 	vmh.vmh_cpuids[4].code = 0x80000001;
665 	vmh.vmh_cpuids[4].leaf = 0x00;
666 
667 	vmh.vmh_version = VM_DUMP_VERSION;
668 
669 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
670 		CPUID_LEAF(vmh.vmh_cpuids[i].code,
671 		    vmh.vmh_cpuids[i].leaf,
672 		    vmh.vmh_cpuids[i].a,
673 		    vmh.vmh_cpuids[i].b,
674 		    vmh.vmh_cpuids[i].c,
675 		    vmh.vmh_cpuids[i].d);
676 	}
677 
678 	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
679 		return (-1);
680 
681 	return (0);
682 }
683 
684 int
685 dump_mem(int fd, struct vm_create_params *vcp)
686 {
687 	unsigned int	i;
688 	int		ret;
689 	struct		vm_mem_range *vmr;
690 
691 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
692 		vmr = &vcp->vcp_memranges[i];
693 		ret = dump_vmr(fd, vmr);
694 		if (ret)
695 			return ret;
696 	}
697 	return (0);
698 }
699 
700 int
701 restore_vm_params(int fd, struct vm_create_params *vcp)
{
702 	unsigned int			i;
703 	struct vm_rwvmparams_params    vpp;
704 
705 	for (i = 0; i < vcp->vcp_ncpus; i++) {
706 		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
707 			log_warn("%s: error restoring vm params", __func__);
708 			return (-1);
709 		}
710 		vpp.vpp_vm_id = vcp->vcp_id;
711 		vpp.vpp_vcpu_id = i;
712 		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
713 			log_debug("%s: writing vm params failed", __func__);
714 			return (-1);
715 		}
716 	}
717 	return (0);
718 }
719 
720 void
721 restore_mem(int fd, struct vm_create_params *vcp)
722 {
723 	unsigned int	     i;
724 	struct vm_mem_range *vmr;
725 
726 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
727 		vmr = &vcp->vcp_memranges[i];
728 		restore_vmr(fd, vmr);
729 	}
730 }
731 
732 int
733 dump_vmr(int fd, struct vm_mem_range *vmr)
734 {
735 	size_t	rem = vmr->vmr_size, read = 0;
736 	char	buf[PAGE_SIZE];
737 
738 	while (rem > 0) {
739 		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
740 			log_warn("failed to read vmr");
741 			return (-1);
742 		}
743 		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
744 			log_warn("failed to dump vmr");
745 			return (-1);
746 		}
747 		rem = rem - PAGE_SIZE;
748 		read = read + PAGE_SIZE;
749 	}
750 	return (0);
751 }
752 
753 void
754 restore_vmr(int fd, struct vm_mem_range *vmr)
755 {
756 	size_t	rem = vmr->vmr_size, wrote = 0;
757 	char	buf[PAGE_SIZE];
758 
759 	while (rem > 0) {
760 		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
761 			fatal("failed to restore vmr");
762 		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
763 			fatal("failed to write vmr");
764 		rem = rem - PAGE_SIZE;
765 		wrote = wrote + PAGE_SIZE;
766 	}
767 }
768 
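/*
 * Pause/unpause protocol (see also vcpu_run_loop below): pause_vm()
 * initializes vm_pause_barrier for vcp_ncpus + 1 waiters, wakes the vcpu
 * run threads, and waits for each of them to reach the barrier from its
 * halted state; the vcpu threads then block on vcpu_unpause_cond until
 * unpause_vm() broadcasts it.
 */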
769 void
770 pause_vm(struct vm_create_params *vcp)
771 {
772 	unsigned int n;
773 	int ret;
774 	if (current_vm->vm_state & VM_STATE_PAUSED)
775 		return;
776 
777 	current_vm->vm_state |= VM_STATE_PAUSED;
778 
779 	ret = pthread_barrier_init(&vm_pause_barrier, NULL, vcp->vcp_ncpus + 1);
780 	if (ret) {
781 		log_warnx("%s: cannot initialize pause barrier (%d)",
782 		    __progname, ret);
783 		return;
784 	}
785 
786 	for (n = 0; n < vcp->vcp_ncpus; n++) {
787 		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
788 		if (ret) {
789 			log_warnx("%s: can't broadcast vcpu run cond (%d)",
790 			    __func__, (int)ret);
791 			return;
792 		}
793 	}
794 	ret = pthread_barrier_wait(&vm_pause_barrier);
795 	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
796 		log_warnx("%s: could not wait on pause barrier (%d)",
797 		    __func__, (int)ret);
798 		return;
799 	}
800 
801 	ret = pthread_barrier_destroy(&vm_pause_barrier);
802 	if (ret) {
803 		log_warnx("%s: could not destroy pause barrier (%d)",
804 		    __progname, ret);
805 		return;
806 	}
807 
808 	i8253_stop();
809 	mc146818_stop();
810 	ns8250_stop();
811 	virtio_stop(vcp);
812 }
813 
814 void
815 unpause_vm(struct vm_create_params *vcp)
816 {
817 	unsigned int n;
818 	int ret;
819 	if (!(current_vm->vm_state & VM_STATE_PAUSED))
820 		return;
821 
822 	current_vm->vm_state &= ~VM_STATE_PAUSED;
823 	for (n = 0; n < vcp->vcp_ncpus; n++) {
824 		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
825 		if (ret) {
826 			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
827 			    __func__, (int)ret);
828 			return;
829 		}
830 	}
831 
832 	i8253_start();
833 	mc146818_start();
834 	ns8250_start();
835 	virtio_start(vcp);
836 }
837 
838 /*
839  * vcpu_reset
840  *
841  * Requests vmm(4) to reset the indicated VCPU in the indicated VM to
842  * the register state provided.
843  *
844  * Parameters
845  *  vmid: VM ID to reset
846  *  vcpu_id: VCPU ID to reset
847  *  vrs: the register state to initialize
848  *
849  * Return values:
850  *  0: success
851  *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
852  *      valid)
853  */
854 int
855 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
856 {
857 	struct vm_resetcpu_params vrp;
858 
859 	memset(&vrp, 0, sizeof(vrp));
860 	vrp.vrp_vm_id = vmid;
861 	vrp.vrp_vcpu_id = vcpu_id;
862 	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
863 
864 	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
865 
866 	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
867 		return (errno);
868 
869 	return (0);
870 }
871 
872 /*
873  * create_memory_map
874  *
875  * Sets up the guest physical memory ranges that the VM can access.
876  *
877  * Parameters:
878  *  vcp: VM create parameters describing the VM whose memory map
879  *       is being created
880  *
881  * Return values:
882  *  nothing
883  */
884 void
885 create_memory_map(struct vm_create_params *vcp)
886 {
887 	size_t len, mem_bytes;
888 	size_t above_1m = 0, above_4g = 0;
889 
890 	mem_bytes = vcp->vcp_memranges[0].vmr_size;
891 	vcp->vcp_nmemranges = 0;
892 	if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE)
893 		return;
894 
895 	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
896 	len = LOWMEM_KB * 1024;
897 	vcp->vcp_memranges[0].vmr_gpa = 0x0;
898 	vcp->vcp_memranges[0].vmr_size = len;
899 	mem_bytes -= len;
900 
901 	/*
902 	 * Second memory region: LOWMEM_KB - 1MB.
903 	 *
904 	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
905 	 * We have to add this region, because some systems
906 	 * unconditionally write to 0xb8000 (VGA RAM), and
907 	 * we need to make sure that vmm(4) permits accesses
908 	 * to it. So allocate guest memory for it.
909 	 */
910 	len = MB(1) - (LOWMEM_KB * 1024);
911 	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
912 	vcp->vcp_memranges[1].vmr_size = len;
913 	mem_bytes -= len;
914 
915 	/* If we have less than 2MB remaining, still create a 2nd BIOS area. */
916 	if (mem_bytes <= MB(2)) {
917 		vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END;
918 		vcp->vcp_memranges[2].vmr_size = MB(2);
919 		vcp->vcp_nmemranges = 3;
920 		return;
921 	}
922 
923 	/*
924 	 * Calculate how to split any remaining memory across the 4GB
925 	 * boundary while making sure we do not place physical memory into
926 	 * MMIO ranges.
927 	 */
928 	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) {
929 		above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1);
930 		above_4g = mem_bytes - above_1m;
931 	} else {
932 		above_1m = mem_bytes;
933 		above_4g = 0;
934 	}
935 
936 	/* Third memory region: area above 1MB to MMIO region */
937 	vcp->vcp_memranges[2].vmr_gpa = MB(1);
938 	vcp->vcp_memranges[2].vmr_size = above_1m;
939 
940 	/* Fourth region: 2nd copy of BIOS above MMIO ending at 4GB */
941 	vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
942 	vcp->vcp_memranges[3].vmr_size = MB(2);
943 
944 	/* Fifth region: any remainder above 4GB */
945 	if (above_4g > 0) {
946 		vcp->vcp_memranges[4].vmr_gpa = GB(4);
947 		vcp->vcp_memranges[4].vmr_size = above_4g;
948 		vcp->vcp_nmemranges = 5;
949 	} else
950 		vcp->vcp_nmemranges = 4;
951 }
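
/*
 * In terms of the constants used above, the resulting guest physical
 * layout is roughly:
 *
 *	[0, LOWMEM_KB * 1024)			DOS low memory
 *	[LOWMEM_KB * 1024, 1MB)			ROM/VGA hole (still backed by RAM)
 *	[1MB, up to VMM_PCI_MMIO_BAR_BASE)	memory below the PCI MMIO hole
 *	[VMM_PCI_MMIO_BAR_END + 1, +2MB)	second BIOS copy, ending at 4GB
 *	[4GB, ...)				any remaining memory
 *
 * When 2MB or less remains after the first two regions, only the 2MB
 * BIOS range is added (three ranges total).
 */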
952 
953 /*
954  * alloc_guest_mem
955  *
956  * Allocates memory for the guest.
957  * Instead of doing a single allocation with one mmap(), we allocate memory
958  * separately for every range for the following reasons:
959  * - ASLR for the individual ranges
960  * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
961  *   map the single mmap'd userspace memory to the individual guest physical
962  *   memory ranges, the underlying amap of the single mmap'd range would have
963  *   to allocate per-page reference counters. The reason is that the
964  *   individual guest physical ranges would reference the single mmap'd region
965  *   only partially. However, if every guest physical range has its own
966  *   corresponding mmap'd userspace allocation, there are no partial
967  *   references: every guest physical range fully references an mmap'd
968  *   range => no per-page reference counters have to be allocated.
969  *
970  * Return values:
971  *  0: success
972  *  !0: failure - errno indicating the source of the failure
973  */
974 int
975 alloc_guest_mem(struct vm_create_params *vcp)
976 {
977 	void *p;
978 	int ret;
979 	size_t i, j;
980 	struct vm_mem_range *vmr;
981 
982 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
983 		vmr = &vcp->vcp_memranges[i];
984 		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
985 		    MAP_PRIVATE | MAP_ANON, -1, 0);
986 		if (p == MAP_FAILED) {
987 			ret = errno;
988 			for (j = 0; j < i; j++) {
989 				vmr = &vcp->vcp_memranges[j];
990 				munmap((void *)vmr->vmr_va, vmr->vmr_size);
991 			}
992 
993 			return (ret);
994 		}
995 
996 		vmr->vmr_va = (vaddr_t)p;
997 	}
998 
999 	return (0);
1000 }
1001 
1002 /*
1003  * vmm_create_vm
1004  *
1005  * Requests vmm(4) to create a new VM using the supplied creation
1006  * parameters. This operation results in the creation of the in-kernel
1007  * structures for the VM, but does not start the VM's vcpu(s).
1008  *
1009  * Parameters:
1010  *  vcp: vm_create_params struct containing the VM's desired creation
1011  *      configuration
1012  *
1013  * Return values:
1014  *  0: success
1015  *  !0 : ioctl to vmm(4) failed
1016  */
1017 int
1018 vmm_create_vm(struct vm_create_params *vcp)
1019 {
1020 	/* Sanity check arguments */
1021 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
1022 		return (EINVAL);
1023 
1024 	if (vcp->vcp_nmemranges == 0 ||
1025 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1026 		return (EINVAL);
1027 
1028 	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
1029 		return (EINVAL);
1030 
1031 	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
1032 		return (EINVAL);
1033 
1034 	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
1035 		return (errno);
1036 
1037 	return (0);
1038 }
1039 
1040 /*
1041  * init_emulated_hw
1042  *
1043  * Initializes the userspace hardware emulation
1044  */
1045 void
1046 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
1047     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1048 {
1049 	struct vm_create_params *vcp = &vmc->vmc_params;
1050 	size_t i;
1051 	uint64_t memlo, memhi;
1052 
1053 	/* Calculate memory size for NVRAM registers */
1054 	memlo = memhi = 0;
1055 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1056 		if (vcp->vcp_memranges[i].vmr_gpa == MB(1) &&
1057 		    vcp->vcp_memranges[i].vmr_size > (15 * MB(1)))
1058 			memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1));
1059 		else if (vcp->vcp_memranges[i].vmr_gpa == GB(4))
1060 			memhi = vcp->vcp_memranges[i].vmr_size;
1061 	}
1062 
1063 	/* Reset the IO port map */
1064 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1065 
1066 	/* Init i8253 PIT */
1067 	i8253_init(vcp->vcp_id);
1068 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1069 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1070 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1071 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1072 	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
1073 
1074 	/* Init mc146818 RTC */
1075 	mc146818_init(vcp->vcp_id, memlo, memhi);
1076 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1077 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1078 
1079 	/* Init master and slave PICs */
1080 	i8259_init();
1081 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1082 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1083 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1084 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1085 	ioports_map[ELCR0] = vcpu_exit_elcr;
1086 	ioports_map[ELCR1] = vcpu_exit_elcr;
1087 
1088 	/* Init ns8250 UART */
1089 	ns8250_init(con_fd, vcp->vcp_id);
1090 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1091 		ioports_map[i] = vcpu_exit_com;
1092 
1093 	/* Init QEMU fw_cfg interface */
1094 	fw_cfg_init(vmc);
1095 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1096 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1097 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1098 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1099 
1100 	/* Initialize PCI */
1101 	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
1102 		ioports_map[i] = vcpu_exit_pci;
1103 
1104 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1105 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1106 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1107 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1108 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1109 	pci_init();
1110 
1111 	/* Initialize virtio devices */
1112 	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
1113 }
1114 /*
1115  * restore_emulated_hw
1116  *
1117  * Restores the userspace hardware emulation from fd
1118  */
1119 void
1120 restore_emulated_hw(struct vm_create_params *vcp, int fd,
1121     int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
1122 {
1123 	/* struct vm_create_params *vcp = &vmc->vmc_params; */
1124 	int i;
1125 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1126 
1127 	/* Init i8253 PIT */
1128 	i8253_restore(fd, vcp->vcp_id);
1129 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1130 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1131 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1132 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1133 
1134 	/* Init master and slave PICs */
1135 	i8259_restore(fd);
1136 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1137 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1138 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1139 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1140 
1141 	/* Init ns8250 UART */
1142 	ns8250_restore(fd, con_fd, vcp->vcp_id);
1143 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1144 		ioports_map[i] = vcpu_exit_com;
1145 
1146 	/* Init mc146818 RTC */
1147 	mc146818_restore(fd, vcp->vcp_id);
1148 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1149 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1150 
1151 	/* Init QEMU fw_cfg interface */
1152 	fw_cfg_restore(fd);
1153 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1154 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1155 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1156 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1157 
1158 	/* Initialize PCI */
1159 	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
1160 		ioports_map[i] = vcpu_exit_pci;
1161 
1162 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1163 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1164 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1165 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1166 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1167 	pci_restore(fd);
1168 	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
1169 }
1170 
1171 /*
1172  * run_vm
1173  *
1174  * Runs the VM whose creation parameters are specified in vcp
1175  *
1176  * Parameters:
1177  *  child_cdrom: previously-opened child ISO disk file descriptor
1178  *  child_disks: previously-opened child VM disk file descriptors
1179  *  child_taps: previously-opened child tap file descriptors
1180  *  vmc: vmop_create_params struct containing the VM's desired creation
1181  *      configuration
1182  *  vrs: VCPU register state to initialize
1183  *
1184  * Return values:
1185  *  0: the VM exited normally
1186  *  !0 : the VM exited abnormally or failed to start
1187  */
1188 int
1189 run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
1190     int *child_taps, struct vmop_create_params *vmc,
1191     struct vcpu_reg_state *vrs)
1192 {
1193 	struct vm_create_params *vcp = &vmc->vmc_params;
1194 	struct vm_rwregs_params vregsp;
1195 	uint8_t evdone = 0;
1196 	size_t i;
1197 	int ret;
1198 	pthread_t *tid, evtid;
1199 	struct vm_run_params **vrp;
1200 	void *exit_status;
1201 
1202 	if (vcp == NULL)
1203 		return (EINVAL);
1204 
1205 	if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
1206 		return (EINVAL);
1207 
1208 	if (child_disks == NULL && vcp->vcp_ndisks != 0)
1209 		return (EINVAL);
1210 
1211 	if (child_taps == NULL && vcp->vcp_nnics != 0)
1212 		return (EINVAL);
1213 
1214 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
1215 		return (EINVAL);
1216 
1217 	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
1218 		return (EINVAL);
1219 
1220 	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
1221 		return (EINVAL);
1222 
1223 	if (vcp->vcp_nmemranges == 0 ||
1224 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1225 		return (EINVAL);
1226 
1227 	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
1228 	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
1229 	if (tid == NULL || vrp == NULL) {
1230 		log_warn("%s: memory allocation error - exiting.",
1231 		    __progname);
1232 		return (ENOMEM);
1233 	}
1234 
1235 	log_debug("%s: initializing hardware for vm %s", __func__,
1236 	    vcp->vcp_name);
1237 
1238 	if (!(current_vm->vm_state & VM_STATE_RECEIVED))
1239 		init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);
1240 
1241 	ret = pthread_mutex_init(&threadmutex, NULL);
1242 	if (ret) {
1243 		log_warn("%s: could not initialize thread state mutex",
1244 		    __func__);
1245 		return (ret);
1246 	}
1247 	ret = pthread_cond_init(&threadcond, NULL);
1248 	if (ret) {
1249 		log_warn("%s: could not initialize thread state "
1250 		    "condition variable", __func__);
1251 		return (ret);
1252 	}
1253 
1254 	mutex_lock(&threadmutex);
1255 
1256 	log_debug("%s: starting vcpu threads for vm %s", __func__,
1257 	    vcp->vcp_name);
1258 
1259 	/*
1260 	 * Create and launch one thread for each VCPU. These threads may
1261 	 * migrate between PCPUs over time; any CPU state that needs to be
1262 	 * reloaded after such a migration is detected and handled by vmm(4)
1263 	 * in the kernel.
1264 	 */
1265 	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
1266 		vrp[i] = malloc(sizeof(struct vm_run_params));
1267 		if (vrp[i] == NULL) {
1268 			log_warn("%s: memory allocation error - "
1269 			    "exiting.", __progname);
1270 			/* caller will exit, so skip freeing */
1271 			return (ENOMEM);
1272 		}
1273 		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
1274 		if (vrp[i]->vrp_exit == NULL) {
1275 			log_warn("%s: memory allocation error - "
1276 			    "exiting.", __progname);
1277 			/* caller will exit, so skip freeing */
1278 			return (ENOMEM);
1279 		}
1280 		vrp[i]->vrp_vm_id = vcp->vcp_id;
1281 		vrp[i]->vrp_vcpu_id = i;
1282 
1283 		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
1284 			log_warnx("%s: cannot reset VCPU %zu - exiting.",
1285 			    __progname, i);
1286 			return (EIO);
1287 		}
1288 
1289 		/* Write the registers once more because vcpu_reset() changed them */
1290 		if (current_vm->vm_state & VM_STATE_RECEIVED) {
1291 			vregsp.vrwp_vm_id = vcp->vcp_id;
1292 			vregsp.vrwp_vcpu_id = i;
1293 			vregsp.vrwp_regs = *vrs;
1294 			vregsp.vrwp_mask = VM_RWREGS_ALL;
1295 			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
1296 			    &vregsp)) == -1) {
1297 				log_warn("%s: writeregs failed", __func__);
1298 				return (ret);
1299 			}
1300 		}
1301 
1302 		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
1303 		if (ret) {
1304 			log_warnx("%s: cannot initialize cond var (%d)",
1305 			    __progname, ret);
1306 			return (ret);
1307 		}
1308 
1309 		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
1310 		if (ret) {
1311 			log_warnx("%s: cannot initialize mtx (%d)",
1312 			    __progname, ret);
1313 			return (ret);
1314 		}
1315 
1316 		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
1317 		if (ret) {
1318 			log_warnx("%s: cannot initialize unpause var (%d)",
1319 			    __progname, ret);
1320 			return (ret);
1321 		}
1322 
1323 		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
1324 		if (ret) {
1325 			log_warnx("%s: cannot initialize unpause mtx (%d)",
1326 			    __progname, ret);
1327 			return (ret);
1328 		}
1329 
1330 		vcpu_hlt[i] = 0;
1331 
1332 		/* Start each VCPU run thread at vcpu_run_loop */
1333 		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
1334 		if (ret) {
1335 			/* caller will _exit after this return */
1336 			errno = ret;
1337 			log_warn("%s: could not create vcpu thread %zu",
1338 			    __func__, i);
1339 			return (ret);
1340 		}
1341 	}
1342 
1343 	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
1344 	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
1345 	if (ret) {
1346 		errno = ret;
1347 		log_warn("%s: could not create event thread", __func__);
1348 		return (ret);
1349 	}
1350 
1351 	for (;;) {
1352 		ret = pthread_cond_wait(&threadcond, &threadmutex);
1353 		if (ret) {
1354 			log_warn("%s: waiting on thread state condition "
1355 			    "variable failed", __func__);
1356 			return (ret);
1357 		}
1358 
1359 		/*
1360 		 * Did a VCPU thread exit with an error? => return the first one
1361 		 */
1362 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1363 			if (vcpu_done[i] == 0)
1364 				continue;
1365 
1366 			if (pthread_join(tid[i], &exit_status)) {
1367 				log_warn("%s: failed to join thread %zd - "
1368 				    "exiting", __progname, i);
1369 				return (EIO);
1370 			}
1371 
1372 			ret = (intptr_t)exit_status;
1373 		}
1374 
1375 		/* Did the event thread exit? => return with an error */
1376 		if (evdone) {
1377 			if (pthread_join(evtid, &exit_status)) {
1378 				log_warn("%s: failed to join event thread - "
1379 				    "exiting", __progname);
1380 				return (EIO);
1381 			}
1382 
1383 			log_warnx("%s: vm %d event thread exited "
1384 			    "unexpectedly", __progname, vcp->vcp_id);
1385 			return (EIO);
1386 		}
1387 
1388 		/* Did all VCPU threads exit successfully? => return */
1389 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1390 			if (vcpu_done[i] == 0)
1391 				break;
1392 		}
1393 		if (i == vcp->vcp_ncpus)
1394 			return (ret);
1395 
1396 		/* Some more threads to wait for, start over */
1397 	}
1398 
1399 	return (ret);
1400 }
1401 
1402 void *
1403 event_thread(void *arg)
1404 {
1405 	uint8_t *donep = arg;
1406 	intptr_t ret;
1407 
1408 	ret = event_dispatch();
1409 
1410 	mutex_lock(&threadmutex);
1411 	*donep = 1;
1412 	pthread_cond_signal(&threadcond);
1413 	mutex_unlock(&threadmutex);
1414 
1415 	return (void *)ret;
1416 }
1417 
1418 /*
1419  * vcpu_run_loop
1420  *
1421  * Runs a single VCPU until vmm(4) requires help handling an exit,
1422  * or the VM terminates.
1423  *
1424  * Parameters:
1425  *  arg: vcpu_run_params for the VCPU being run by this thread
1426  *
1427  * Return values:
1428  *  NULL: the VCPU shut down properly
1429  *  !NULL: error processing the VCPU run, or the VCPU shut down abnormally
1430  */
1431 void *
1432 vcpu_run_loop(void *arg)
1433 {
1434 	struct vm_run_params *vrp = (struct vm_run_params *)arg;
1435 	intptr_t ret = 0;
1436 	int irq;
1437 	uint32_t n;
1438 
1439 	vrp->vrp_continue = 0;
1440 	n = vrp->vrp_vcpu_id;
1441 
1442 	for (;;) {
1443 		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
1444 
1445 		if (ret) {
1446 			log_warnx("%s: can't lock vcpu run mtx (%d)",
1447 			    __func__, (int)ret);
1448 			return ((void *)ret);
1449 		}
1450 
1451 		/* If we are halted and need to pause, pause */
1452 		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
1453 			ret = pthread_barrier_wait(&vm_pause_barrier);
1454 			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
1455 				log_warnx("%s: could not wait on pause barrier (%d)",
1456 				    __func__, (int)ret);
1457 				return ((void *)ret);
1458 			}
1459 
1460 			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
1461 			if (ret) {
1462 				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
1463 				    __func__, (int)ret);
1464 				return ((void *)ret);
1465 			}
1466 
1467 			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
1468 			    &vcpu_unpause_mtx[n]);
1469 			if (ret) {
1470 				log_warnx(
1471 				    "%s: can't wait on unpause cond (%d)",
1472 				    __func__, (int)ret);
1473 				break;
1474 			}
1475 			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
1476 			if (ret) {
1477 				log_warnx("%s: can't unlock unpause mtx (%d)",
1478 				    __func__, (int)ret);
1479 				break;
1480 			}
1481 		}
1482 
1483 		/* If we are halted and not paused, wait */
1484 		if (vcpu_hlt[n]) {
1485 			ret = pthread_cond_wait(&vcpu_run_cond[n],
1486 			    &vcpu_run_mtx[n]);
1487 
1488 			if (ret) {
1489 				log_warnx(
1490 				    "%s: can't wait on cond (%d)",
1491 				    __func__, (int)ret);
1492 				(void)pthread_mutex_unlock(
1493 				    &vcpu_run_mtx[n]);
1494 				break;
1495 			}
1496 		}
1497 
1498 		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
1499 
1500 		if (ret) {
1501 			log_warnx("%s: can't unlock mutex on cond (%d)",
1502 			    __func__, (int)ret);
1503 			break;
1504 		}
1505 
1506 		if (vrp->vrp_irqready && i8259_is_pending()) {
1507 			irq = i8259_ack();
1508 			vrp->vrp_irq = irq;
1509 		} else
1510 			vrp->vrp_irq = 0xFFFF;
1511 
1512 		/* Still more pending? */
1513 		if (i8259_is_pending()) {
1514 			/*
1515 			 * XXX can probably avoid ioctls here by providing intr
1516 			 * in vrp
1517 			 */
1518 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1519 			    vrp->vrp_vcpu_id, 1)) {
1520 				fatal("can't set INTR");
1521 			}
1522 		} else {
1523 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1524 			    vrp->vrp_vcpu_id, 0)) {
1525 				fatal("can't clear INTR");
1526 			}
1527 		}
1528 
1529 		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
1530 			/* If run ioctl failed, exit */
1531 			ret = errno;
1532 			log_warn("%s: vm %d / vcpu %d run ioctl failed",
1533 			    __func__, vrp->vrp_vm_id, n);
1534 			break;
1535 		}
1536 
1537 		/* If the VM is terminating, exit normally */
1538 		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
1539 			ret = (intptr_t)NULL;
1540 			break;
1541 		}
1542 
1543 		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
1544 			/*
1545 			 * vmm(4) needs help handling an exit, handle in
1546 			 * vcpu_exit.
1547 			 */
1548 			ret = vcpu_exit(vrp);
1549 			if (ret)
1550 				break;
1551 		}
1552 	}
1553 
1554 	mutex_lock(&threadmutex);
1555 	vcpu_done[n] = 1;
1556 	pthread_cond_signal(&threadcond);
1557 	mutex_unlock(&threadmutex);
1558 
1559 	return ((void *)ret);
1560 }
1561 
1562 int
1563 vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
1564 {
1565 	struct vm_intr_params vip;
1566 
1567 	memset(&vip, 0, sizeof(vip));
1568 
1569 	vip.vip_vm_id = vm_id;
1570 	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
1571 	vip.vip_intr = intr;
1572 
1573 	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
1574 		return (errno);
1575 
1576 	return (0);
1577 }
1578 
1579 /*
1580  * vcpu_exit_pci
1581  *
1582  * Handle all I/O to the emulated PCI subsystem.
1583  *
1584  * Parameters:
1585  *  vrp: vcpu run parameters containing guest state for this exit
1586  *
1587  * Return value:
1588  *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
1589  *      be injected.
1590  */
1591 uint8_t
1592 vcpu_exit_pci(struct vm_run_params *vrp)
1593 {
1594 	struct vm_exit *vei = vrp->vrp_exit;
1595 	uint8_t intr;
1596 
1597 	intr = 0xFF;
1598 
1599 	switch (vei->vei.vei_port) {
1600 	case PCI_MODE1_ADDRESS_REG:
1601 		pci_handle_address_reg(vrp);
1602 		break;
1603 	case PCI_MODE1_DATA_REG:
1604 	case PCI_MODE1_DATA_REG + 1:
1605 	case PCI_MODE1_DATA_REG + 2:
1606 	case PCI_MODE1_DATA_REG + 3:
1607 		pci_handle_data_reg(vrp);
1608 		break;
1609 	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
1610 		intr = pci_handle_io(vrp);
1611 		break;
1612 	default:
1613 		log_warnx("%s: unknown PCI register 0x%llx",
1614 		    __progname, (uint64_t)vei->vei.vei_port);
1615 		break;
1616 	}
1617 
1618 	return (intr);
1619 }
1620 
1621 /*
1622  * vcpu_exit_inout
1623  *
1624  * Handle all I/O exits that need to be emulated in vmd. This includes the
1625  * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
1626  *
1627  * Parameters:
1628  *  vrp: vcpu run parameters containing guest state for this exit
1629  */
1630 void
1631 vcpu_exit_inout(struct vm_run_params *vrp)
1632 {
1633 	struct vm_exit *vei = vrp->vrp_exit;
1634 	uint8_t intr = 0xFF;
1635 
1636 	if (ioports_map[vei->vei.vei_port] != NULL)
1637 		intr = ioports_map[vei->vei.vei_port](vrp);
1638 	else if (vei->vei.vei_dir == VEI_DIR_IN)
1639 		set_return_data(vei, 0xFFFFFFFF);
1640 
1641 	if (intr != 0xFF)
1642 		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
1643 }
1644 
1645 /*
1646  * vcpu_exit_eptviolation
1647  *
1648  * Handle an EPT violation.
1649  *
1650  * Parameters:
1651  *  vrp: vcpu run parameters containing guest state for this exit
1652  *
1653  * Return values:
1654  *  0: no action required
1655  *  EAGAIN: a protection fault occurred; kill the vm.
1656  */
1657 int
1658 vcpu_exit_eptviolation(struct vm_run_params *vrp)
1659 {
1660 	struct vm_exit *ve = vrp->vrp_exit;
1661 
1662 	/*
1663 	 * The vcpu may be exiting to vmd to handle a pending interrupt,
1664 	 * but the last exit type may have been VMX_EXIT_EPT_VIOLATION.
1665 	 * Check the fault_type to ensure we really are processing
1666 	 * a VMX_EXIT_EPT_VIOLATION.
1667 	 */
1668 	if (ve->vee.vee_fault_type == VEE_FAULT_PROTECT) {
1669 		log_debug("%s: EPT Violation: rip=0x%llx",
1670 		    __progname, vrp->vrp_exit->vrs.vrs_gprs[VCPU_REGS_RIP]);
1671 		return (EAGAIN);
1672 	}
1673 
1674 	return (0);
1675 }
1676 
1677 /*
1678  * vcpu_exit
1679  *
1680  * Handle a vcpu exit. This function is called when it is determined that
1681  * vmm(4) requires the assistance of vmd to support a particular guest
1682  * exit type (eg, accessing an I/O port or device). Guest state is contained
1683  * in 'vrp', and will be resent to vmm(4) on exit completion.
1684  *
1685  * Upon conclusion of handling the exit, the function determines if any
1686  * interrupts should be injected into the guest, and asserts the proper
1687  * IRQ line whose interrupt should be vectored.
1688  *
1689  * Parameters:
1690  *  vrp: vcpu run parameters containing guest state for this exit
1691  *
1692  * Return values:
1693  *  0: the exit was handled successfully
1694  *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
1695  */
1696 int
1697 vcpu_exit(struct vm_run_params *vrp)
1698 {
1699 	int ret;
1700 
1701 	switch (vrp->vrp_exit_reason) {
1702 	case VMX_EXIT_INT_WINDOW:
1703 	case SVM_VMEXIT_VINTR:
1704 	case VMX_EXIT_CPUID:
1705 	case VMX_EXIT_EXTINT:
1706 	case SVM_VMEXIT_INTR:
1707 	case SVM_VMEXIT_NPF:
1708 	case SVM_VMEXIT_MSR:
1709 	case SVM_VMEXIT_CPUID:
1710 		/*
1711 		 * We may be exiting to vmd to handle a pending interrupt but
1712 		 * at the same time the last exit type may have been one of
1713 		 * these. In this case, there's nothing extra to be done
1714 		 * here (and falling through to the default case below results
1715 		 * in more vmd log spam).
1716 		 */
1717 		break;
1718 	case VMX_EXIT_EPT_VIOLATION:
1719 		ret = vcpu_exit_eptviolation(vrp);
1720 		if (ret)
1721 			return (ret);
1722 
1723 		break;
1724 	case VMX_EXIT_IO:
1725 	case SVM_VMEXIT_IOIO:
1726 		vcpu_exit_inout(vrp);
1727 		break;
1728 	case VMX_EXIT_HLT:
1729 	case SVM_VMEXIT_HLT:
1730 		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1731 		if (ret) {
1732 			log_warnx("%s: can't lock vcpu mutex (%d)",
1733 			    __func__, ret);
1734 			return (ret);
1735 		}
1736 		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
1737 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1738 		if (ret) {
1739 			log_warnx("%s: can't unlock vcpu mutex (%d)",
1740 			    __func__, ret);
1741 			return (ret);
1742 		}
1743 		break;
1744 	case VMX_EXIT_TRIPLE_FAULT:
1745 	case SVM_VMEXIT_SHUTDOWN:
1746 		/* reset VM */
1747 		return (EAGAIN);
1748 	default:
1749 		log_debug("%s: unknown exit reason 0x%x",
1750 		    __progname, vrp->vrp_exit_reason);
1751 	}
1752 
1753 	vrp->vrp_continue = 1;
1754 
1755 	return (0);
1756 }
1757 
1758 /*
1759  * find_gpa_range
1760  *
1761  * Search for a contiguous guest physical mem range.
1762  *
1763  * Parameters:
1764  *  vcp: VM create parameters that contain the memory map to search in
1765  *  gpa: the starting guest physical address
1766  *  len: the length of the memory range
1767  *
1768  * Return values:
1769  *  NULL: on failure if there is no memory range as described by the parameters
1770  *  Pointer to vm_mem_range that contains the start of the range otherwise.
1771  */
1772 static struct vm_mem_range *
1773 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
1774 {
1775 	size_t i, n;
1776 	struct vm_mem_range *vmr;
1777 
1778 	/* Find the first vm_mem_range that contains gpa */
1779 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1780 		vmr = &vcp->vcp_memranges[i];
1781 		if (gpa < vmr->vmr_gpa + vmr->vmr_size)
1782 			break;
1783 	}
1784 
1785 	/* No range found. */
1786 	if (i == vcp->vcp_nmemranges)
1787 		return (NULL);
1788 
1789 	/*
1790 	 * vmr may cover the range [gpa, gpa + len) only partly. Make
1791 	 * sure that the following vm_mem_ranges are contiguous and
1792 	 * cover the rest.
1793 	 */
1794 	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
1795 	if (len < n)
1796 		len = 0;
1797 	else
1798 		len -= n;
1799 	gpa = vmr->vmr_gpa + vmr->vmr_size;
1800 	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
1801 		vmr = &vcp->vcp_memranges[i];
1802 		if (gpa != vmr->vmr_gpa)
1803 			return (NULL);
1804 		if (len <= vmr->vmr_size)
1805 			len = 0;
1806 		else
1807 			len -= vmr->vmr_size;
1808 
1809 		gpa = vmr->vmr_gpa + vmr->vmr_size;
1810 	}
1811 
1812 	if (len != 0)
1813 		return (NULL);
1814 
1815 	return (vmr);
1816 }
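
/*
 * Illustrative example: with contiguous ranges [0, 636KB) and
 * [636KB, 1MB), a lookup for gpa = 620KB, len = 32KB succeeds because the
 * second range begins exactly where the first one ends.  If the second
 * range instead began at 640KB, the 4KB gap would make the lookup fail
 * and NULL would be returned.
 */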
1817 
1818 /*
1819  * write_mem
1820  *
1821  * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
1822  *
1823  * Parameters:
1824  *  dst: the destination paddr_t in the guest VM
1825  *  buf: data to copy (or NULL to zero the data)
1826  *  len: number of bytes to copy
1827  *
1828  * Return values:
1829  *  0: success
1830  *  EINVAL: if the guest physical memory range [dst, dst + len) does not
1831  *      exist in the guest.
1832  */
1833 int
1834 write_mem(paddr_t dst, const void *buf, size_t len)
1835 {
1836 	const char *from = buf;
1837 	char *to;
1838 	size_t n, off;
1839 	struct vm_mem_range *vmr;
1840 
1841 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
1842 	if (vmr == NULL) {
1843 		errno = EINVAL;
1844 		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
1845 		    "len = 0x%zx", __func__, dst, len);
1846 		return (EINVAL);
1847 	}
1848 
1849 	off = dst - vmr->vmr_gpa;
1850 	while (len != 0) {
1851 		n = vmr->vmr_size - off;
1852 		if (len < n)
1853 			n = len;
1854 
1855 		to = (char *)vmr->vmr_va + off;
1856 		if (buf == NULL)
1857 			memset(to, 0, n);
1858 		else {
1859 			memcpy(to, from, n);
1860 			from += n;
1861 		}
1862 		len -= n;
1863 		off = 0;
1864 		vmr++;
1865 	}
1866 
1867 	return (0);
1868 }
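
/*
 * Usage sketch (illustrative only; the struct name below is made up): a
 * device model placing a structure at guest physical address gpa would
 * do roughly
 *
 *	struct mydev_state st = { 0 };
 *	if (write_mem(gpa, &st, sizeof(st)))
 *		log_warnx("failed to write device state to guest memory");
 *
 * Passing buf == NULL zeroes the range instead of copying from a buffer.
 */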
1869 
1870 /*
1871  * read_mem
1872  *
1873  * Reads memory at guest paddr 'src' into 'buf'.
1874  *
1875  * Parameters:
1876  *  src: the source paddr_t in the guest VM to read from.
1877  *  buf: destination (local) buffer
1878  *  len: number of bytes to read
1879  *
1880  * Return values:
1881  *  0: success
1882  *  EINVAL: if the guest physical memory range [src, src + len) does not
1883  *      exist in the guest.
1884  */
1885 int
1886 read_mem(paddr_t src, void *buf, size_t len)
1887 {
1888 	char *from, *to = buf;
1889 	size_t n, off;
1890 	struct vm_mem_range *vmr;
1891 
1892 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
1893 	if (vmr == NULL) {
1894 		errno = EINVAL;
1895 		log_warn("%s: failed - invalid memory range src = 0x%lx, "
1896 		    "len = 0x%zx", __func__, src, len);
1897 		return (EINVAL);
1898 	}
1899 
1900 	off = src - vmr->vmr_gpa;
1901 	while (len != 0) {
1902 		n = vmr->vmr_size - off;
1903 		if (len < n)
1904 			n = len;
1905 
1906 		from = (char *)vmr->vmr_va + off;
1907 		memcpy(to, from, n);
1908 
1909 		to += n;
1910 		len -= n;
1911 		off = 0;
1912 		vmr++;
1913 	}
1914 
1915 	return (0);
1916 }
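
/*
 * Usage sketch (illustrative only, not part of the build): reading a
 * guest-resident structure into a local copy.  The guest physical
 * address is hypothetical.
 */
#if 0
	uint64_t desc[2];

	if (read_mem(0x6000, desc, sizeof(desc)))
		log_warnx("failed to read descriptor from guest memory");
#endif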
1917 
1918 /*
1919  * vcpu_assert_pic_irq
1920  *
1921  * Injects the specified IRQ on the supplied vcpu/vm
1922  *
1923  * Parameters:
1924  *  vm_id: VM ID to inject to
1925  *  vcpu_id: VCPU ID to inject to
1926  *  irq: IRQ to inject
1927  */
1928 void
1929 vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1930 {
1931 	int ret;
1932 
1933 	i8259_assert_irq(irq);
1934 
1935 	if (i8259_is_pending()) {
1936 		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
1937 			fatalx("%s: can't assert INTR", __func__);
1938 
1939 		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
1940 		if (ret)
1941 			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);
1942 
1943 		vcpu_hlt[vcpu_id] = 0;
1944 		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
1945 		if (ret)
1946 			fatalx("%s: can't signal (%d)", __func__, ret);
1947 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
1948 		if (ret)
1949 			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
1950 	}
1951 }
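
/*
 * Sketch (illustrative only, not part of the build): a device model
 * raising and later lowering its interrupt line.  The vm/vcpu ids and
 * the irq number are hypothetical; real devices carry them in their
 * device state.
 */
#if 0
	uint32_t vm_id = 0, vcpu_id = 0;	/* hypothetical ids */

	/* Raise IRQ 9 on the vcpu ... */
	vcpu_assert_pic_irq(vm_id, vcpu_id, 9);
	/* ... and lower it once the device condition has been serviced. */
	vcpu_deassert_pic_irq(vm_id, vcpu_id, 9);
#endif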
1952 
1953 /*
1954  * vcpu_deassert_pic_irq
1955  *
1956  * Clears the specified IRQ on the supplied vcpu/vm
1957  *
1958  * Parameters:
1959  *  vm_id: VM ID to clear in
1960  *  vcpu_id: VCPU ID to clear in
1961  *  irq: IRQ to clear
1962  */
1963 void
1964 vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1965 {
1966 	i8259_deassert_irq(irq);
1967 
1968 	if (!i8259_is_pending()) {
1969 		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
1970 			fatalx("%s: can't deassert INTR for vm_id %d, "
1971 			    "vcpu_id %d", __func__, vm_id, vcpu_id);
1972 	}
1973 }
1974 
1975 /*
1976  * fd_hasdata
1977  *
1978  * Determines if data can be read from a file descriptor.
1979  *
1980  * Parameters:
1981  *  fd: the fd to check
1982  *
1983  * Return values:
1984  *  1 if data can be read from an fd, or 0 otherwise.
1985  */
1986 int
1987 fd_hasdata(int fd)
1988 {
1989 	struct pollfd pfd[1];
1990 	int nready, hasdata = 0;
1991 
1992 	pfd[0].fd = fd;
1993 	pfd[0].events = POLLIN;
1994 	nready = poll(pfd, 1, 0);
1995 	if (nready == -1)
1996 		log_warn("checking file descriptor for data failed");
1997 	else if (nready == 1 && pfd[0].revents & POLLIN)
1998 		hasdata = 1;
1999 	return (hasdata);
2000 }
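
/*
 * Usage sketch (illustrative only, not part of the build): a device
 * model polling its backing fd before attempting a read, so the read
 * cannot block the emulation loop.  'dev_fd' and queue_rx_byte() are
 * hypothetical.
 */
#if 0
	char ch;

	if (fd_hasdata(dev_fd) && read(dev_fd, &ch, 1) == 1)
		queue_rx_byte(ch);	/* hypothetical helper */
#endif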
2001 
2002 /*
2003  * mutex_lock
2004  *
2005  * Wrapper function for pthread_mutex_lock that does error checking and that
2006  * exits on failure
2007  */
2008 void
2009 mutex_lock(pthread_mutex_t *m)
2010 {
2011 	int ret;
2012 
2013 	ret = pthread_mutex_lock(m);
2014 	if (ret) {
2015 		errno = ret;
2016 		fatal("could not acquire mutex");
2017 	}
2018 }
2019 
2020 /*
2021  * mutex_unlock
2022  *
2023  * Wrapper function for pthread_mutex_unlock that does error checking and that
2024  * exits on failure
2025  */
2026 void
2027 mutex_unlock(pthread_mutex_t *m)
2028 {
2029 	int ret;
2030 
2031 	ret = pthread_mutex_unlock(m);
2032 	if (ret) {
2033 		errno = ret;
2034 		fatal("could not release mutex");
2035 	}
2036 }
2037 
2038 /*
2039  * set_return_data
2040  *
2041  * Utility function for manipulating register data in vm exit info structs. This
2042  * function ensures that the data is copied to the vei->vei.vei_data field with
2043  * the proper size for the operation being performed.
2044  *
2045  * Parameters:
2046  *  vei: exit information
2047  *  data: return data
2048  */
2049 void
2050 set_return_data(struct vm_exit *vei, uint32_t data)
2051 {
2052 	switch (vei->vei.vei_size) {
2053 	case 1:
2054 		vei->vei.vei_data &= ~0xFF;
2055 		vei->vei.vei_data |= (uint8_t)data;
2056 		break;
2057 	case 2:
2058 		vei->vei.vei_data &= ~0xFFFF;
2059 		vei->vei.vei_data |= (uint16_t)data;
2060 		break;
2061 	case 4:
2062 		vei->vei.vei_data = data;
2063 		break;
2064 	}
2065 }
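
/*
 * Sketch (illustrative only, not part of the build): completing an
 * emulated port read.  'vei' is assumed to be the struct vm_exit for
 * the current 'in' instruction exit.  Because set_return_data() honors
 * vei_size, a 1-byte access replaces only the low byte of vei_data and
 * preserves the upper bits.
 */
#if 0
	uint32_t reg_value = 0x7f;	/* hypothetical device register */

	set_return_data(vei, reg_value);
#endif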
2066 
2067 /*
2068  * get_input_data
2069  *
2070  * Utility function for manipulating register data in vm exit info
2071  * structs. This function ensures that the data is copied from the
2072  * vei->vei.vei_data field with the proper size for the operation being
2073  * performed.
2074  *
2075  * Parameters:
2076  *  vei: exit information
2077  *  data: location to store the result
2078  */
2079 void
2080 get_input_data(struct vm_exit *vei, uint32_t *data)
2081 {
2082 	switch (vei->vei.vei_size) {
2083 	case 1:
2084 		*data &= 0xFFFFFF00;
2085 		*data |= (uint8_t)vei->vei.vei_data;
2086 		break;
2087 	case 2:
2088 		*data &= 0xFFFF0000;
2089 		*data |= (uint16_t)vei->vei.vei_data;
2090 		break;
2091 	case 4:
2092 		*data = vei->vei.vei_data;
2093 		break;
2094 	default:
2095 		log_warnx("%s: invalid i/o size %d", __func__,
2096 		    vei->vei.vei_size);
2097 	}
2098 
2099 }
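
/*
 * Sketch (illustrative only, not part of the build): the write-direction
 * counterpart.  'vei' is assumed to be the struct vm_exit for the
 * current 'out' instruction exit; get_input_data() masks the value to
 * the access size, so the destination should be pre-initialized.
 */
#if 0
	uint32_t out_value = 0;

	get_input_data(vei, &out_value);
	/* 'out_value' now holds the 1-, 2- or 4-byte datum the guest
	 * wrote; hand it to the device register logic from here. */
#endif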
2100 
2101 /*
2102  * translate_gva
2103  *
2104  * Translates a guest virtual address to a guest physical address by walking
2105  * the currently active page table (if needed).
2106  *
2107  * Note - this function can possibly alter the supplied VCPU state.
2108  *  Specifically, it may inject exceptions depending on the current VCPU
2109  *  configuration, and may alter %cr2 on #PF. Consequently, this function
2110  *  should only be used as part of instruction emulation.
2111  *
2112  * Parameters:
2113  *  exit: The VCPU this translation should be performed for (guest MMU settings
2114  *   are gathered from this VCPU)
2115  *  va: virtual address to translate
2116  *  pa: pointer to paddr_t variable that will receive the translated physical
2117  *   address. 'pa' is unchanged on error.
2118  *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
2119  *   the address should be translated
2120  *
2121  * Return values:
2122  *  0: the address was successfully translated - 'pa' contains the physical
2123  *     address currently mapped by 'va'.
2124  *  EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
2125  *     and %cr2 set in the vcpu structure.
2126  *  EINVAL: an error occurred reading paging table structures
2127  */
2128 int
2129 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
2130 {
2131 	int level, shift, pdidx;
2132 	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
2133 	uint64_t shift_width, pte_size;
2134 	struct vcpu_reg_state *vrs;
2135 
2136 	vrs = &exit->vrs;
2137 
2138 	if (!pa)
2139 		return (EINVAL);
2140 
2141 	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
2142 		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
2143 		*pa = va;
2144 		return (0);
2145 	}
2146 
2147 	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
2148 
2149 	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
2150 	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
2151 
2152 	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
2153 		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
2154 			pte_size = sizeof(uint64_t);
2155 			shift_width = 9;
2156 
2157 			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
2158 				/* 4 level paging */
2159 				level = 4;
2160 				mask = L4_MASK;
2161 				shift = L4_SHIFT;
2162 			} else {
2163 				/* 32 bit with PAE paging */
2164 				level = 3;
2165 				mask = L3_MASK;
2166 				shift = L3_SHIFT;
2167 			}
2168 		} else {
2169 			/* 32 bit paging */
2170 			level = 2;
2171 			shift_width = 10;
2172 			mask = 0xFFC00000;
2173 			shift = 22;
2174 			pte_size = sizeof(uint32_t);
2175 		}
2176 	} else
2177 		return (EINVAL);
2178 
2179 	/* XXX: Check for R bit in segment selector and set A bit */
2180 
2181 	for (;level > 0; level--) {
2182 		pdidx = (va & mask) >> shift;
2183 		pte_paddr = (pt_paddr) + (pdidx * pte_size);
2184 
2185 		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
2186 		    level, pte_paddr);
2187 		if (read_mem(pte_paddr, &pte, pte_size)) {
2188 			log_warn("%s: failed to read pte", __func__);
2189 			return (EFAULT);
2190 		}
2191 
2192 		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
2193 		    pte);
2194 
2195 		/* XXX: Set CR2  */
2196 		if (!(pte & PG_V))
2197 			return (EFAULT);
2198 
2199 		/* XXX: Check for SMAP */
2200 		if ((mode == PROT_WRITE) && !(pte & PG_RW))
2201 			return (EPERM);
2202 
2203 		if ((exit->cpl > 0) && !(pte & PG_u))
2204 			return (EPERM);
2205 
2206 		pte = pte | PG_U;
2207 		if (mode == PROT_WRITE)
2208 			pte = pte | PG_M;
2209 		if (write_mem(pte_paddr, &pte, pte_size)) {
2210 			log_warn("%s: failed to write back flags to pte",
2211 			    __func__);
2212 			return (EIO);
2213 		}
2214 
2215 	/* XXX: EINVAL if in 32-bit mode and PG_PS is 1 but CR4.PSE is 0 */
2216 		if (pte & PG_PS)
2217 			break;
2218 
2219 		if (level > 1) {
2220 			pt_paddr = pte & PG_FRAME;
2221 			shift -= shift_width;
2222 			mask = mask >> shift_width;
2223 		}
2224 	}
2225 
2226 	low_mask = (1 << shift) - 1;
2227 	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
2228 	*pa = (pte & high_mask) | (va & low_mask);
2229 
2230 	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);
2231 
2232 	return (0);
2233 }
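
/*
 * Usage sketch (illustrative only, not part of the build): translating a
 * guest virtual address during instruction emulation and fetching the
 * bytes behind it.  've' stands for the current struct vm_exit and the
 * virtual address is hypothetical.
 */
#if 0
	uint64_t gva = 0x7f7fffe000ULL, gpa;
	uint8_t buf[8];

	if (translate_gva(ve, gva, &gpa, PROT_READ) == 0 &&
	    read_mem(gpa, buf, sizeof(buf)) == 0) {
		/* 'buf' now holds the guest bytes backing 'gva'. */
	}
#endif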
2234 
2235 /*
2236  * vm_pipe_init
2237  *
2238  * Initialize a vm_dev_pipe, setting up its file descriptors and its
2239  * event structure with the given callback.
2240  *
2241  * Parameters:
2242  *  p: pointer to vm_dev_pipe struct to initialize
2243  *  cb: callback to use for READ events on the read end of the pipe
2244  */
2245 void
2246 vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
2247 {
2248 	int ret;
2249 	int fds[2];
2250 
2251 	memset(p, 0, sizeof(struct vm_dev_pipe));
2252 
2253 	ret = pipe(fds);
2254 	if (ret)
2255 		fatal("failed to create vm_dev_pipe pipe");
2256 
2257 	p->read = fds[0];
2258 	p->write = fds[1];
2259 
2260 	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
2261 }
2262 
2263 /*
2264  * vm_pipe_send
2265  *
2266  * Send a message to an emulated device via the provided vm_dev_pipe.
2267  *
2268  * Parameters:
2269  *  p: pointer to initialized vm_dev_pipe
2270  *  msg: message to send in the channel
2271  */
2272 void
2273 vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
2274 {
2275 	size_t n;
2276 	n = write(p->write, &msg, sizeof(msg));
2277 	if (n != sizeof(msg))
2278 		fatal("failed to write to device pipe");
2279 }
2280 
2281 /*
2282  * vm_pipe_recv
2283  *
2284  * Receive a message for an emulated device via the provided vm_dev_pipe.
2285  * Returns the message value, or exits fatally on failure.
2286  *
2287  * Parameters:
2288  *  p: pointer to initialized vm_dev_pipe
2289  *
2290  * Return values:
2291  *  a value of enum pipe_msg_type or fatal exit on read(2) error
2292  */
2293 enum pipe_msg_type
2294 vm_pipe_recv(struct vm_dev_pipe *p)
2295 {
2296 	size_t n;
2297 	enum pipe_msg_type msg;
2298 	n = read(p->read, &msg, sizeof(msg));
2299 	if (n != sizeof(msg))
2300 		fatal("failed to read from device pipe");
2301 
2302 	return msg;
2303 }
2304
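
/*
 * Sketch (illustrative only, not part of the build): the intended
 * send/receive pairing.  One thread posts a message on the pipe with
 * vm_pipe_send(); the EV_READ callback installed by vm_pipe_init()
 * drains it on the event thread with vm_pipe_recv().  'dev_pipe' and
 * MSG_EXAMPLE are hypothetical.
 */
#if 0
	enum pipe_msg_type msg;

	/* Producer side, e.g. from a vcpu thread: */
	vm_pipe_send(&dev_pipe, MSG_EXAMPLE);

	/* Consumer side, inside the device's EV_READ callback: */
	msg = vm_pipe_recv(&dev_pipe);
	if (msg == MSG_EXAMPLE) {
		/* act on the request */
	}
#endif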