xref: /openbsd-src/usr.sbin/vmd/vm.c (revision c1a45aed656e7d5627c30c92421893a76f370ccb)
1 /*	$OpenBSD: vm.c,v 1.68 2022/03/01 21:46:19 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* PAGE_SIZE */
20 #include <sys/types.h>
21 #include <sys/ioctl.h>
22 #include <sys/queue.h>
23 #include <sys/wait.h>
24 #include <sys/uio.h>
25 #include <sys/stat.h>
26 #include <sys/socket.h>
27 #include <sys/time.h>
28 #include <sys/mman.h>
29 #include <sys/resource.h>
30 
31 #include <dev/ic/i8253reg.h>
32 #include <dev/isa/isareg.h>
33 #include <dev/pci/pcireg.h>
34 
35 #include <machine/psl.h>
36 #include <machine/pte.h>
37 #include <machine/specialreg.h>
38 #include <machine/vmmvar.h>
39 
40 #include <net/if.h>
41 
42 #include <errno.h>
43 #include <event.h>
44 #include <fcntl.h>
45 #include <imsg.h>
46 #include <limits.h>
47 #include <poll.h>
48 #include <pthread.h>
49 #include <stddef.h>
50 #include <stdio.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <unistd.h>
54 #include <util.h>
55 
56 #include "atomicio.h"
57 #include "fw_cfg.h"
58 #include "i8253.h"
59 #include "i8259.h"
60 #include "loadfile.h"
61 #include "mc146818.h"
62 #include "ns8250.h"
63 #include "pci.h"
64 #include "virtio.h"
65 #include "vmd.h"
66 #include "vmm.h"
67 
68 io_fn_t ioports_map[MAX_PORTS];
69 
70 int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
71     struct vmop_create_params *, struct vcpu_reg_state *);
72 void vm_dispatch_vmm(int, short, void *);
73 void *event_thread(void *);
74 void *vcpu_run_loop(void *);
75 int vcpu_exit(struct vm_run_params *);
76 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
77 void create_memory_map(struct vm_create_params *);
78 int alloc_guest_mem(struct vm_create_params *);
79 int vmm_create_vm(struct vm_create_params *);
80 void init_emulated_hw(struct vmop_create_params *, int,
81     int[][VM_MAX_BASE_PER_DISK], int *);
82 void restore_emulated_hw(struct vm_create_params *, int, int *,
83     int[][VM_MAX_BASE_PER_DISK], int);
84 void vcpu_exit_inout(struct vm_run_params *);
85 int vcpu_exit_eptviolation(struct vm_run_params *);
86 uint8_t vcpu_exit_pci(struct vm_run_params *);
87 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
88 int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
89 int send_vm(int, struct vm_create_params *);
90 int dump_send_header(int);
91 int dump_vmr(int, struct vm_mem_range *);
92 int dump_mem(int, struct vm_create_params *);
93 void restore_vmr(int, struct vm_mem_range *);
94 void restore_mem(int, struct vm_create_params *);
95 int restore_vm_params(int, struct vm_create_params *);
96 void pause_vm(struct vm_create_params *);
97 void unpause_vm(struct vm_create_params *);
98 
99 int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int);
100 
101 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
102     size_t);
103 
104 int con_fd;
105 struct vmd_vm *current_vm;
106 
107 extern struct vmd *env;
108 
109 extern char *__progname;
110 
111 pthread_mutex_t threadmutex;
112 pthread_cond_t threadcond;
113 
114 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
115 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
116 pthread_barrier_t vm_pause_barrier;
117 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
118 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
119 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
120 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
121 
122 /*
123  * Represents a standard register set for an OS to be booted
124  * as a flat 64 bit address space.
125  *
126  * NOT set here are:
127  *  RIP
128  *  RSP
129  *  GDTR BASE
130  *
131  * Specific bootloaders should clone this structure and override
132  * those fields as needed.
133  *
134  * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
135  *        features of the CPU in use.
136  */
137 static const struct vcpu_reg_state vcpu_init_flat64 = {
138 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
139 	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
140 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
141 	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
142 	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
143 	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
144 	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
145 	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
146 	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
147 	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
148 	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
149 	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
150 	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
151 	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
152 	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
153 	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
154 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
155 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
156 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
157 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
158 	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
159 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
160 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
161 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
162 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
163 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
164 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
165 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
166 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
167 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
168 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
169 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
170 	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
171 	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
172 };
173 
174 /*
175  * Represents a standard register set for a BIOS to be booted
176  * as a flat 16 bit address space.
177  */
178 static const struct vcpu_reg_state vcpu_init_flat16 = {
179 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
180 	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
181 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
182 	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
183 	.vrs_crs[VCPU_REGS_CR3] = 0,
184 	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
185 	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
186 	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
187 	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
188 	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
189 	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
190 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
191 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
192 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
193 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
194 	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
195 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
196 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
197 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
198 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
199 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
200 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
201 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
202 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
203 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
204 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
205 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
206 	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
207 };
208 
209 /*
210  * loadfile_bios
211  *
212  * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
213  * directly into memory.
214  *
215  * Parameters:
216  *  fp: gzFile handle of the BIOS image to load
217  *  size: uncompressed size of the image
218  *  (out) vrs: register state to set on init for this kernel
219  *
220  * Return values:
221  *  0 if successful
222  *  -1 on failure (the image could not be seeked or read, or does not fit below 1MB)
223  */
224 int
225 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
226 {
227 	off_t	 off;
228 
229 	/* Set up a "flat 16 bit" register state for BIOS */
230 	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
231 
232 	/* Seek to the beginning of the BIOS image */
233 	if (gzseek(fp, 0, SEEK_SET) == -1)
234 		return (-1);
235 
236 	/* The BIOS image must end at 1M */
237 	if ((off = 1048576 - size) < 0)
238 		return (-1);
239 
240 	/* Read BIOS image into memory */
241 	if (mread(fp, off, size) != (size_t)size) {
242 		errno = EIO;
243 		return (-1);
244 	}
245 
246 	log_debug("%s: loaded BIOS image", __func__);
247 
248 	return (0);
249 }
250 
251 /*
252  * start_vm
253  *
254  * After forking a new VM process, starts the new VM with the creation
255  * parameters supplied (in the incoming vm->vm_params field). This
256  * function performs a basic sanity check on the incoming parameters
257  * and then performs the following steps to complete the creation of the VM:
258  *
259  * 1. validates and creates the new VM
260  * 2. opens the imsg control channel to the parent and drops more privilege
261  * 3. drops additional privileges by calling pledge(2)
262  * 4. loads the kernel from the disk image or file descriptor
263  * 5. runs the VM's VCPU loops.
264  *
265  * Parameters:
266  *  vm: The VM data structure containing the VM create parameters.
267  *  fd: The imsg socket that is connected to the parent process.
268  *
269  * Return values:
270  *  0: success
271  *  !0 : failure - typically an errno indicating the source of the failure
272  */
273 int
274 start_vm(struct vmd_vm *vm, int fd)
275 {
276 	struct vmop_create_params *vmc = &vm->vm_params;
277 	struct vm_create_params	*vcp = &vmc->vmc_params;
278 	struct vcpu_reg_state	 vrs;
279 	int			 nicfds[VMM_MAX_NICS_PER_VM];
280 	int			 ret;
281 	gzFile			 fp;
282 	size_t			 i;
283 	struct vm_rwregs_params  vrp;
284 	struct stat		 sb;
285 
286 	/* Child */
287 	setproctitle("%s", vcp->vcp_name);
288 	log_procinit(vcp->vcp_name);
289 
290 	if (!(vm->vm_state & VM_STATE_RECEIVED))
291 		create_memory_map(vcp);
292 
293 	ret = alloc_guest_mem(vcp);
294 
295 	if (ret) {
296 		struct rlimit lim;
297 		char buf[FMT_SCALED_STRSIZE];
298 		if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) {
299 			if (fmt_scaled(lim.rlim_cur, buf) == 0)
300 				fatalx("could not allocate guest memory (data "
301 				    "limit is %s)", buf);
302 		}
303 		errno = ret;
304 		fatal("could not allocate guest memory");
305 	}
306 
307 	ret = vmm_create_vm(vcp);
308 	current_vm = vm;
309 
310 	/* send back the kernel-generated vm id (0 on error) */
311 	if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
312 	    sizeof(vcp->vcp_id))
313 		fatal("failed to send created vm id to vmm process");
314 
315 	if (ret) {
316 		errno = ret;
317 		fatal("create vmm ioctl failed - exiting");
318 	}
319 
320 	/*
321 	 * pledge in the vm processes:
322 	 * stdio - for malloc and basic I/O including events.
323 	 * recvfd - for send/recv.
324 	 * vmm - for the vmm ioctls and operations.
325 	 */
326 	if (pledge("stdio vmm recvfd", NULL) == -1)
327 		fatal("pledge");
328 
329 	if (vm->vm_state & VM_STATE_RECEIVED) {
330 		ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp));
331 		if (ret != sizeof(vrp))
332 			fatal("received incomplete vrp - exiting");
333 		vrs = vrp.vrwp_regs;
334 	} else {
335 		/*
336 		 * Set up default "flat 64 bit" register state - RIP,
337 		 * RSP, and GDT info will be set in bootloader
338 		 */
339 		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
340 
341 		/* Find and open kernel image */
342 		if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
343 			fatalx("failed to open kernel - exiting");
344 
345 		/* Load kernel image */
346 		ret = loadfile_elf(fp, vcp, &vrs, vmc->vmc_bootdevice);
347 
348 		/*
349 		 * Try BIOS as a fallback (only if it was provided as an image
350 		 * with vm->vm_kernel and the file is not compressed)
351 		 */
352 		if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
353 		    gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
354 			ret = loadfile_bios(fp, sb.st_size, &vrs);
355 
356 		if (ret)
357 			fatal("failed to load kernel or BIOS - exiting");
358 
359 		gzclose(fp);
360 	}
361 
362 	if (vm->vm_kernel != -1)
363 		close(vm->vm_kernel);
364 
365 	con_fd = vm->vm_tty;
366 	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
367 		fatal("failed to set nonblocking mode on console");
368 
369 	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
370 		nicfds[i] = vm->vm_ifs[i].vif_fd;
371 
372 	event_init();
373 
374 	if (vm->vm_state & VM_STATE_RECEIVED) {
375 		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
376 		    vm->vm_disks, vm->vm_cdrom);
377 		restore_mem(vm->vm_receive_fd, vcp);
378 		if (restore_vm_params(vm->vm_receive_fd, vcp))
379 			fatal("restore vm params failed");
380 		unpause_vm(vcp);
381 	}
382 
383 	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
384 		fatal("setup vm pipe");
385 
386 	/* Execute the vcpu run loop(s) for this VM */
387 	ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);
388 
389 	/* Ensure that any in-flight data is written back */
390 	virtio_shutdown(vm);
391 
392 	return (ret);
393 }
394 
395 /*
396  * vm_dispatch_vmm
397  *
398  * imsg callback for messages that are received from the vmm parent process.
399  */
400 void
401 vm_dispatch_vmm(int fd, short event, void *arg)
402 {
403 	struct vmd_vm		*vm = arg;
404 	struct vmop_result	 vmr;
405 	struct vmop_addr_result	 var;
406 	struct imsgev		*iev = &vm->vm_iev;
407 	struct imsgbuf		*ibuf = &iev->ibuf;
408 	struct imsg		 imsg;
409 	ssize_t			 n;
410 	int			 verbose;
411 
412 	if (event & EV_READ) {
413 		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
414 			fatal("%s: imsg_read", __func__);
415 		if (n == 0)
416 			_exit(0);
417 	}
418 
419 	if (event & EV_WRITE) {
420 		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
421 			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
422 		if (n == 0)
423 			_exit(0);
424 	}
425 
426 	for (;;) {
427 		if ((n = imsg_get(ibuf, &imsg)) == -1)
428 			fatal("%s: imsg_get", __func__);
429 		if (n == 0)
430 			break;
431 
432 #if DEBUG > 1
433 		log_debug("%s: got imsg %d from %s",
434 		    __func__, imsg.hdr.type,
435 		    vm->vm_params.vmc_params.vcp_name);
436 #endif
437 
438 		switch (imsg.hdr.type) {
439 		case IMSG_CTL_VERBOSE:
440 			IMSG_SIZE_CHECK(&imsg, &verbose);
441 			memcpy(&verbose, imsg.data, sizeof(verbose));
442 			log_setverbose(verbose);
443 			break;
444 		case IMSG_VMDOP_VM_SHUTDOWN:
445 			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
446 				_exit(0);
447 			break;
448 		case IMSG_VMDOP_VM_REBOOT:
449 			if (vmmci_ctl(VMMCI_REBOOT) == -1)
450 				_exit(0);
451 			break;
452 		case IMSG_VMDOP_PAUSE_VM:
453 			vmr.vmr_result = 0;
454 			vmr.vmr_id = vm->vm_vmid;
455 			pause_vm(&vm->vm_params.vmc_params);
456 			imsg_compose_event(&vm->vm_iev,
457 			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
458 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
459 			    sizeof(vmr));
460 			break;
461 		case IMSG_VMDOP_UNPAUSE_VM:
462 			vmr.vmr_result = 0;
463 			vmr.vmr_id = vm->vm_vmid;
464 			unpause_vm(&vm->vm_params.vmc_params);
465 			imsg_compose_event(&vm->vm_iev,
466 			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
467 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
468 			    sizeof(vmr));
469 			break;
470 		case IMSG_VMDOP_SEND_VM_REQUEST:
471 			vmr.vmr_id = vm->vm_vmid;
472 			vmr.vmr_result = send_vm(imsg.fd,
473 			    &vm->vm_params.vmc_params);
474 			imsg_compose_event(&vm->vm_iev,
475 			    IMSG_VMDOP_SEND_VM_RESPONSE,
476 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
477 			    sizeof(vmr));
478 			if (!vmr.vmr_result) {
479 				imsg_flush(&current_vm->vm_iev.ibuf);
480 				_exit(0);
481 			}
482 			break;
483 		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
484 			IMSG_SIZE_CHECK(&imsg, &var);
485 			memcpy(&var, imsg.data, sizeof(var));
486 
487 			log_debug("%s: received tap addr %s for nic %d",
488 			    vm->vm_params.vmc_params.vcp_name,
489 			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);
490 
491 			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
492 			break;
493 		default:
494 			fatalx("%s: got invalid imsg %d from %s",
495 			    __func__, imsg.hdr.type,
496 			    vm->vm_params.vmc_params.vcp_name);
497 		}
498 		imsg_free(&imsg);
499 	}
500 	imsg_event_add(iev);
501 }
502 
503 /*
504  * vm_shutdown
505  *
506  * Tell the vmm parent process to shut down or reboot the VM, then exit.
507  */
508 __dead void
509 vm_shutdown(unsigned int cmd)
510 {
511 	switch (cmd) {
512 	case VMMCI_NONE:
513 	case VMMCI_SHUTDOWN:
514 		(void)imsg_compose_event(&current_vm->vm_iev,
515 		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
516 		break;
517 	case VMMCI_REBOOT:
518 		(void)imsg_compose_event(&current_vm->vm_iev,
519 		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
520 		break;
521 	default:
522 		fatalx("invalid vm ctl command: %d", cmd);
523 	}
524 	imsg_flush(&current_vm->vm_iev.ibuf);
525 
526 	_exit(0);
527 }
528 
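/*
 * send_vm
 *
 * Dumps the state of the current VM to the file descriptor 'fd' so that
 * it can be received and resumed by another vmd process: the dump header,
 * the VM create parameters, the register state of every VCPU, the emulated
 * device state, the guest memory and the VM parameters are written out in
 * that order.  The VM is paused before the dump and terminated once the
 * dump completes; on failure it is unpaused and left running.
 *
 * Parameters:
 *  fd: file descriptor to write the VM state to
 *  vcp: vm_create_params struct of the VM being sent
 *
 * Return values:
 *  0: success
 *  !0: failure
 */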
529 int
530 send_vm(int fd, struct vm_create_params *vcp)
531 {
532 	struct vm_rwregs_params	   vrp;
533 	struct vm_rwvmparams_params vpp;
534 	struct vmop_create_params *vmc;
535 	struct vm_terminate_params vtp;
536 	unsigned int		   flags = 0;
537 	unsigned int		   i;
538 	int			   ret = 0;
539 	size_t			   sz;
540 
541 	if (dump_send_header(fd)) {
542 		log_info("%s: failed to send vm dump header", __func__);
543 		goto err;
544 	}
545 
546 	pause_vm(vcp);
547 
548 	vmc = calloc(1, sizeof(struct vmop_create_params));
549 	if (vmc == NULL) {
550 		log_warn("%s: calloc error getting vmc", __func__);
551 		ret = -1;
552 		goto err;
553 	}
554 
555 	flags |= VMOP_CREATE_MEMORY;
556 	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
557 	    vmop_create_params));
558 	vmc->vmc_flags = flags;
559 	vrp.vrwp_vm_id = vcp->vcp_id;
560 	vrp.vrwp_mask = VM_RWREGS_ALL;
561 	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
562 	vpp.vpp_vm_id = vcp->vcp_id;
563 
564 	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
565 	if (sz != sizeof(struct vmop_create_params)) {
566 		ret = -1;
567 		goto err;
568 	}
569 
570 	for (i = 0; i < vcp->vcp_ncpus; i++) {
571 		vrp.vrwp_vcpu_id = i;
572 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
573 			log_warn("%s: readregs failed", __func__);
574 			goto err;
575 		}
576 
577 		sz = atomicio(vwrite, fd, &vrp,
578 		    sizeof(struct vm_rwregs_params));
579 		if (sz != sizeof(struct vm_rwregs_params)) {
580 			log_warn("%s: dumping registers failed", __func__);
581 			ret = -1;
582 			goto err;
583 		}
584 	}
585 
586 	if ((ret = i8253_dump(fd)))
587 		goto err;
588 	if ((ret = i8259_dump(fd)))
589 		goto err;
590 	if ((ret = ns8250_dump(fd)))
591 		goto err;
592 	if ((ret = mc146818_dump(fd)))
593 		goto err;
594 	if ((ret = fw_cfg_dump(fd)))
595 		goto err;
596 	if ((ret = pci_dump(fd)))
597 		goto err;
598 	if ((ret = virtio_dump(fd)))
599 		goto err;
600 	if ((ret = dump_mem(fd, vcp)))
601 		goto err;
602 
603 	for (i = 0; i < vcp->vcp_ncpus; i++) {
604 		vpp.vpp_vcpu_id = i;
605 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
606 			log_warn("%s: readvmparams failed", __func__);
607 			goto err;
608 		}
609 
610 		sz = atomicio(vwrite, fd, &vpp,
611 		    sizeof(struct vm_rwvmparams_params));
612 		if (sz != sizeof(struct vm_rwvmparams_params)) {
613 			log_warn("%s: dumping vm params failed", __func__);
614 			ret = -1;
615 			goto err;
616 		}
617 	}
618 
619 	vtp.vtp_vm_id = vcp->vcp_id;
620 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
621 		log_warnx("%s: term IOC error: %d", __func__,
622 		    errno);
623 	}
624 err:
625 	close(fd);
626 	if (ret)
627 		unpause_vm(vcp);
628 	return ret;
629 }
630 
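/*
 * dump_send_header
 *
 * Writes the VM dump header (dump signature, the host's CPUID leaves and
 * the dump format version) to the file descriptor 'fd'.
 *
 * Return values:
 *  0: success
 *  -1: failure writing the header
 */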
631 int
632 dump_send_header(int fd)
{
633 	struct vm_dump_header	   vmh;
634 	int			   i;
635 
636 	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
637 	    sizeof(vmh.vmh_signature));
638 
639 	vmh.vmh_cpuids[0].code = 0x00;
640 	vmh.vmh_cpuids[0].leaf = 0x00;
641 
642 	vmh.vmh_cpuids[1].code = 0x01;
643 	vmh.vmh_cpuids[1].leaf = 0x00;
644 
645 	vmh.vmh_cpuids[2].code = 0x07;
646 	vmh.vmh_cpuids[2].leaf = 0x00;
647 
648 	vmh.vmh_cpuids[3].code = 0x0d;
649 	vmh.vmh_cpuids[3].leaf = 0x00;
650 
651 	vmh.vmh_cpuids[4].code = 0x80000001;
652 	vmh.vmh_cpuids[4].leaf = 0x00;
653 
654 	vmh.vmh_version = VM_DUMP_VERSION;
655 
656 	for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
657 		CPUID_LEAF(vmh.vmh_cpuids[i].code,
658 		    vmh.vmh_cpuids[i].leaf,
659 		    vmh.vmh_cpuids[i].a,
660 		    vmh.vmh_cpuids[i].b,
661 		    vmh.vmh_cpuids[i].c,
662 		    vmh.vmh_cpuids[i].d);
663 	}
664 
665 	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
666 		return (-1);
667 
668 	return (0);
669 }
670 
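/*
 * dump_mem
 *
 * Writes the guest's memory to the file descriptor 'fd', one memory
 * range at a time, using dump_vmr.
 *
 * Return values:
 *  0: success
 *  !0: failure returned by dump_vmr
 */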
671 int
672 dump_mem(int fd, struct vm_create_params *vcp)
673 {
674 	unsigned int	i;
675 	int		ret;
676 	struct		vm_mem_range *vmr;
677 
678 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
679 		vmr = &vcp->vcp_memranges[i];
680 		ret = dump_vmr(fd, vmr);
681 		if (ret)
682 			return ret;
683 	}
684 	return (0);
685 }
686 
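/*
 * restore_vm_params
 *
 * Reads the per-VCPU VM parameters from the file descriptor 'fd' and
 * installs them in the kernel via the VMM_IOC_WRITEVMPARAMS ioctl.
 *
 * Return values:
 *  0: success
 *  -1: failure reading from 'fd' or writing the parameters
 */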
687 int
688 restore_vm_params(int fd, struct vm_create_params *vcp)
{
689 	unsigned int			i;
690 	struct vm_rwvmparams_params    vpp;
691 
692 	for (i = 0; i < vcp->vcp_ncpus; i++) {
693 		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
694 			log_warn("%s: error restoring vm params", __func__);
695 			return (-1);
696 		}
697 		vpp.vpp_vm_id = vcp->vcp_id;
698 		vpp.vpp_vcpu_id = i;
699 		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
700 			log_debug("%s: writing vm params failed", __func__);
701 			return (-1);
702 		}
703 	}
704 	return (0);
705 }
706 
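/*
 * restore_mem
 *
 * Restores the guest's memory from the file descriptor 'fd', one memory
 * range at a time, using restore_vmr.
 */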
707 void
708 restore_mem(int fd, struct vm_create_params *vcp)
709 {
710 	unsigned int	     i;
711 	struct vm_mem_range *vmr;
712 
713 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
714 		vmr = &vcp->vcp_memranges[i];
715 		restore_vmr(fd, vmr);
716 	}
717 }
718 
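/*
 * dump_vmr
 *
 * Writes a single guest memory range to the file descriptor 'fd' in
 * PAGE_SIZE sized chunks.
 *
 * Return values:
 *  0: success
 *  -1: failure reading guest memory or writing to 'fd'
 */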
719 int
720 dump_vmr(int fd, struct vm_mem_range *vmr)
721 {
722 	size_t	rem = vmr->vmr_size, read=0;
723 	char	buf[PAGE_SIZE];
724 
725 	while (rem > 0) {
726 		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
727 			log_warn("failed to read vmr");
728 			return (-1);
729 		}
730 		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
731 			log_warn("failed to dump vmr");
732 			return (-1);
733 		}
734 		rem = rem - PAGE_SIZE;
735 		read = read + PAGE_SIZE;
736 	}
737 	return (0);
738 }
739 
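/*
 * restore_vmr
 *
 * Reads a single guest memory range from the file descriptor 'fd' in
 * PAGE_SIZE sized chunks and writes it into guest memory.  Fatal on
 * failure.
 */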
740 void
741 restore_vmr(int fd, struct vm_mem_range *vmr)
742 {
743 	size_t	rem = vmr->vmr_size, wrote=0;
744 	char	buf[PAGE_SIZE];
745 
746 	while (rem > 0) {
747 		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
748 			fatal("failed to restore vmr");
749 		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
750 			fatal("failed to write vmr");
751 		rem = rem - PAGE_SIZE;
752 		wrote = wrote + PAGE_SIZE;
753 	}
754 }
755 
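/*
 * pause_vm
 *
 * Pauses the VM: marks it paused, wakes any halted VCPU threads and waits
 * on the pause barrier until every VCPU thread has paused, then stops the
 * emulated devices (i8253, mc146818, ns8250 and virtio).
 */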
756 void
757 pause_vm(struct vm_create_params *vcp)
758 {
759 	unsigned int n;
760 	int ret;
761 	if (current_vm->vm_state & VM_STATE_PAUSED)
762 		return;
763 
764 	current_vm->vm_state |= VM_STATE_PAUSED;
765 
766 	ret = pthread_barrier_init(&vm_pause_barrier, NULL, vcp->vcp_ncpus + 1);
767 	if (ret) {
768 		log_warnx("%s: cannot initialize pause barrier (%d)",
769 		    __progname, ret);
770 		return;
771 	}
772 
773 	for (n = 0; n < vcp->vcp_ncpus; n++) {
774 		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
775 		if (ret) {
776 			log_warnx("%s: can't broadcast vcpu run cond (%d)",
777 			    __func__, (int)ret);
778 			return;
779 		}
780 	}
781 	ret = pthread_barrier_wait(&vm_pause_barrier);
782 	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
783 		log_warnx("%s: could not wait on pause barrier (%d)",
784 		    __func__, (int)ret);
785 		return;
786 	}
787 
788 	ret = pthread_barrier_destroy(&vm_pause_barrier);
789 	if (ret) {
790 		log_warnx("%s: could not destroy pause barrier (%d)",
791 		    __progname, ret);
792 		return;
793 	}
794 
795 	i8253_stop();
796 	mc146818_stop();
797 	ns8250_stop();
798 	virtio_stop(vcp);
799 }
800 
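/*
 * unpause_vm
 *
 * Resumes a paused VM: clears the paused state, wakes the VCPU threads
 * waiting on their unpause condition variables and restarts the emulated
 * devices.
 */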
801 void
802 unpause_vm(struct vm_create_params *vcp)
803 {
804 	unsigned int n;
805 	int ret;
806 	if (!(current_vm->vm_state & VM_STATE_PAUSED))
807 		return;
808 
809 	current_vm->vm_state &= ~VM_STATE_PAUSED;
810 	for (n = 0; n < vcp->vcp_ncpus; n++) {
811 		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
812 		if (ret) {
813 			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
814 			    __func__, (int)ret);
815 			return;
816 		}
817 	}
818 
819 	i8253_start();
820 	mc146818_start();
821 	ns8250_start();
822 	virtio_start(vcp);
823 }
824 
825 /*
826  * vcpu_reset
827  *
828  * Requests vmm(4) to reset the indicated VCPU in the indicated VM to
829  * the register state provided.
830  *
831  * Parameters
832  *  vmid: VM ID to reset
833  *  vcpu_id: VCPU ID to reset
834  *  vrs: the register state to initialize
835  *
836  * Return values:
837  *  0: success
838  *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
839  *      valid)
840  */
841 int
842 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
843 {
844 	struct vm_resetcpu_params vrp;
845 
846 	memset(&vrp, 0, sizeof(vrp));
847 	vrp.vrp_vm_id = vmid;
848 	vrp.vrp_vcpu_id = vcpu_id;
849 	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
850 
851 	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
852 
853 	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
854 		return (errno);
855 
856 	return (0);
857 }
858 
859 /*
860  * create_memory_map
861  *
862  * Sets up the guest physical memory ranges that the VM can access.
863  *
864  * Parameters:
865  *  vcp: VM create parameters describing the VM whose memory map
866  *       is being created
867  *
868  * Return values:
869  *  nothing
870  */
871 void
872 create_memory_map(struct vm_create_params *vcp)
873 {
874 	size_t len, mem_bytes, mem_mb;
875 
876 	mem_mb = vcp->vcp_memranges[0].vmr_size;
877 	vcp->vcp_nmemranges = 0;
878 	if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
879 		return;
880 
881 	mem_bytes = mem_mb * 1024 * 1024;
882 
883 	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
884 	len = LOWMEM_KB * 1024;
885 	vcp->vcp_memranges[0].vmr_gpa = 0x0;
886 	vcp->vcp_memranges[0].vmr_size = len;
887 	mem_bytes -= len;
888 
889 	/*
890 	 * Second memory region: LOWMEM_KB - 1MB.
891 	 *
892 	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
893 	 * We have to add this region, because some systems
894 	 * unconditionally write to 0xb8000 (VGA RAM), and
895 	 * we need to make sure that vmm(4) permits accesses
896 	 * to it. So allocate guest memory for it.
897 	 */
898 	len = 0x100000 - LOWMEM_KB * 1024;
899 	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
900 	vcp->vcp_memranges[1].vmr_size = len;
901 	mem_bytes -= len;
902 
903 	/* Make sure that we do not place physical memory into MMIO ranges. */
904 	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
905 		len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
906 	else
907 		len = mem_bytes;
908 
909 	/* Third memory region: 1MB - (1MB + len) */
910 	vcp->vcp_memranges[2].vmr_gpa = 0x100000;
911 	vcp->vcp_memranges[2].vmr_size = len;
912 	mem_bytes -= len;
913 
914 	if (mem_bytes > 0) {
915 		/* Fourth memory region for the remaining memory (if any) */
916 		vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
917 		vcp->vcp_memranges[3].vmr_size = mem_bytes;
918 		vcp->vcp_nmemranges = 4;
919 	} else
920 		vcp->vcp_nmemranges = 3;
921 }
922 
923 /*
924  * alloc_guest_mem
925  *
926  * Allocates memory for the guest.
927  * Instead of doing a single allocation with one mmap(), we allocate memory
928  * separately for every range for the following reasons:
929  * - ASLR for the individual ranges
930  * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
931  *   map the single mmap'd userspace memory to the individual guest physical
932  *   memory ranges, the underlying amap of the single mmap'd range would have
933  *   to allocate per-page reference counters. The reason is that the
934  *   individual guest physical ranges would reference the single mmap'd region
935  *   only partially. However, if every guest physical range has its own
936  *   corresponding mmap'd userspace allocation, there are no partial
937  *   references: every guest physical range fully references an mmap'd
938  *   range => no per-page reference counters have to be allocated.
939  *
940  * Return values:
941  *  0: success
942  *  !0: failure - errno indicating the source of the failure
943  */
944 int
945 alloc_guest_mem(struct vm_create_params *vcp)
946 {
947 	void *p;
948 	int ret;
949 	size_t i, j;
950 	struct vm_mem_range *vmr;
951 
952 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
953 		vmr = &vcp->vcp_memranges[i];
954 		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
955 		    MAP_PRIVATE | MAP_ANON, -1, 0);
956 		if (p == MAP_FAILED) {
957 			ret = errno;
958 			for (j = 0; j < i; j++) {
959 				vmr = &vcp->vcp_memranges[j];
960 				munmap((void *)vmr->vmr_va, vmr->vmr_size);
961 			}
962 
963 			return (ret);
964 		}
965 
966 		vmr->vmr_va = (vaddr_t)p;
967 	}
968 
969 	return (0);
970 }
971 
972 /*
973  * vmm_create_vm
974  *
975  * Requests vmm(4) to create a new VM using the supplied creation
976  * parameters. This operation results in the creation of the in-kernel
977  * structures for the VM, but does not start the VM's vcpu(s).
978  *
979  * Parameters:
980  *  vcp: vm_create_params struct containing the VM's desired creation
981  *      configuration
982  *
983  * Return values:
984  *  0: success
985  *  !0 : ioctl to vmm(4) failed
986  */
987 int
988 vmm_create_vm(struct vm_create_params *vcp)
989 {
990 	/* Sanity check arguments */
991 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
992 		return (EINVAL);
993 
994 	if (vcp->vcp_nmemranges == 0 ||
995 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
996 		return (EINVAL);
997 
998 	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
999 		return (EINVAL);
1000 
1001 	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
1002 		return (EINVAL);
1003 
1004 	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
1005 		return (errno);
1006 
1007 	return (0);
1008 }
1009 
1010 /*
1011  * init_emulated_hw
1012  *
1013  * Initializes the userspace hardware emulation
1014  */
1015 void
1016 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
1017     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1018 {
1019 	struct vm_create_params *vcp = &vmc->vmc_params;
1020 	int i;
1021 	uint64_t memlo, memhi;
1022 
1023 	/* Calculate memory size for NVRAM registers */
1024 	memlo = memhi = 0;
1025 	if (vcp->vcp_nmemranges > 2)
1026 		memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000;
1027 
1028 	if (vcp->vcp_nmemranges > 3)
1029 		memhi = vcp->vcp_memranges[3].vmr_size;
1030 
1031 	/* Reset the IO port map */
1032 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1033 
1034 	/* Init i8253 PIT */
1035 	i8253_init(vcp->vcp_id);
1036 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1037 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1038 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1039 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1040 	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
1041 
1042 	/* Init mc146818 RTC */
1043 	mc146818_init(vcp->vcp_id, memlo, memhi);
1044 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1045 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1046 
1047 	/* Init master and slave PICs */
1048 	i8259_init();
1049 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1050 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1051 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1052 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1053 	ioports_map[ELCR0] = vcpu_exit_elcr;
1054 	ioports_map[ELCR1] = vcpu_exit_elcr;
1055 
1056 	/* Init ns8250 UART */
1057 	ns8250_init(con_fd, vcp->vcp_id);
1058 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1059 		ioports_map[i] = vcpu_exit_com;
1060 
1061 	/* Init QEMU fw_cfg interface */
1062 	fw_cfg_init(vmc);
1063 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1064 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1065 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1066 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1067 
1068 	/* Initialize PCI */
1069 	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
1070 		ioports_map[i] = vcpu_exit_pci;
1071 
1072 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1073 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1074 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1075 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1076 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1077 	pci_init();
1078 
1079 	/* Initialize virtio devices */
1080 	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
1081 }
1082 /*
1083  * restore_emulated_hw
1084  *
1085  * Restores the userspace hardware emulation from fd
1086  */
1087 void
1088 restore_emulated_hw(struct vm_create_params *vcp, int fd,
1089     int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
1090 {
1091 	/* struct vm_create_params *vcp = &vmc->vmc_params; */
1092 	int i;
1093 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1094 
1095 	/* Init i8253 PIT */
1096 	i8253_restore(fd, vcp->vcp_id);
1097 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1098 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1099 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1100 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1101 
1102 	/* Init master and slave PICs */
1103 	i8259_restore(fd);
1104 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1105 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1106 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1107 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1108 
1109 	/* Init ns8250 UART */
1110 	ns8250_restore(fd, con_fd, vcp->vcp_id);
1111 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1112 		ioports_map[i] = vcpu_exit_com;
1113 
1114 	/* Init mc146818 RTC */
1115 	mc146818_restore(fd, vcp->vcp_id);
1116 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1117 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1118 
1119 	/* Init QEMU fw_cfg interface */
1120 	fw_cfg_restore(fd);
1121 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1122 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1123 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1124 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1125 
1126 	/* Initialize PCI */
1127 	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
1128 		ioports_map[i] = vcpu_exit_pci;
1129 
1130 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1131 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1132 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1133 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1134 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1135 	pci_restore(fd);
1136 	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
1137 }
1138 
1139 /*
1140  * run_vm
1141  *
1142  * Runs the VM whose creation parameters are specified in vcp
1143  *
1144  * Parameters:
1145  *  child_cdrom: previously-opened child ISO disk file descriptor
1146  *  child_disks: previously-opened child VM disk file descriptors
1147  *  child_taps: previously-opened child tap file descriptors
1148  *  vmc: vmop_create_params struct containing the VM's desired creation
1149  *      configuration
1150  *  vrs: VCPU register state to initialize
1151  *
1152  * Return values:
1153  *  0: the VM exited normally
1154  *  !0 : the VM exited abnormally or failed to start
1155  */
1156 int
1157 run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
1158     int *child_taps, struct vmop_create_params *vmc,
1159     struct vcpu_reg_state *vrs)
1160 {
1161 	struct vm_create_params *vcp = &vmc->vmc_params;
1162 	struct vm_rwregs_params vregsp;
1163 	uint8_t evdone = 0;
1164 	size_t i;
1165 	int ret;
1166 	pthread_t *tid, evtid;
1167 	struct vm_run_params **vrp;
1168 	void *exit_status;
1169 
1170 	if (vcp == NULL)
1171 		return (EINVAL);
1172 
1173 	if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
1174 		return (EINVAL);
1175 
1176 	if (child_disks == NULL && vcp->vcp_ndisks != 0)
1177 		return (EINVAL);
1178 
1179 	if (child_taps == NULL && vcp->vcp_nnics != 0)
1180 		return (EINVAL);
1181 
1182 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
1183 		return (EINVAL);
1184 
1185 	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
1186 		return (EINVAL);
1187 
1188 	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
1189 		return (EINVAL);
1190 
1191 	if (vcp->vcp_nmemranges == 0 ||
1192 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1193 		return (EINVAL);
1194 
1195 	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
1196 	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
1197 	if (tid == NULL || vrp == NULL) {
1198 		log_warn("%s: memory allocation error - exiting.",
1199 		    __progname);
1200 		return (ENOMEM);
1201 	}
1202 
1203 	log_debug("%s: initializing hardware for vm %s", __func__,
1204 	    vcp->vcp_name);
1205 
1206 	if (!(current_vm->vm_state & VM_STATE_RECEIVED))
1207 		init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);
1208 
1209 	ret = pthread_mutex_init(&threadmutex, NULL);
1210 	if (ret) {
1211 		log_warn("%s: could not initialize thread state mutex",
1212 		    __func__);
1213 		return (ret);
1214 	}
1215 	ret = pthread_cond_init(&threadcond, NULL);
1216 	if (ret) {
1217 		log_warn("%s: could not initialize thread state "
1218 		    "condition variable", __func__);
1219 		return (ret);
1220 	}
1221 
1222 	mutex_lock(&threadmutex);
1223 
1224 	log_debug("%s: starting vcpu threads for vm %s", __func__,
1225 	    vcp->vcp_name);
1226 
1227 	/*
1228 	 * Create and launch one thread for each VCPU. These threads may
1229 	 * migrate between PCPUs over time; the need to reload CPU state
1230 	 * in such situations is detected and performed by vmm(4) in the
1231 	 * kernel.
1232 	 */
1233 	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
1234 		vrp[i] = malloc(sizeof(struct vm_run_params));
1235 		if (vrp[i] == NULL) {
1236 			log_warn("%s: memory allocation error - "
1237 			    "exiting.", __progname);
1238 			/* caller will exit, so skip freeing */
1239 			return (ENOMEM);
1240 		}
1241 		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
1242 		if (vrp[i]->vrp_exit == NULL) {
1243 			log_warn("%s: memory allocation error - "
1244 			    "exiting.", __progname);
1245 			/* caller will exit, so skip freeing */
1246 			return (ENOMEM);
1247 		}
1248 		vrp[i]->vrp_vm_id = vcp->vcp_id;
1249 		vrp[i]->vrp_vcpu_id = i;
1250 
1251 		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
1252 			log_warnx("%s: cannot reset VCPU %zu - exiting.",
1253 			    __progname, i);
1254 			return (EIO);
1255 		}
1256 
1257 		/* once more because reset_cpu changes regs */
1258 		if (current_vm->vm_state & VM_STATE_RECEIVED) {
1259 			vregsp.vrwp_vm_id = vcp->vcp_id;
1260 			vregsp.vrwp_vcpu_id = i;
1261 			vregsp.vrwp_regs = *vrs;
1262 			vregsp.vrwp_mask = VM_RWREGS_ALL;
1263 			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
1264 			    &vregsp)) == -1) {
1265 				log_warn("%s: writeregs failed", __func__);
1266 				return (ret);
1267 			}
1268 		}
1269 
1270 		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
1271 		if (ret) {
1272 			log_warnx("%s: cannot initialize cond var (%d)",
1273 			    __progname, ret);
1274 			return (ret);
1275 		}
1276 
1277 		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
1278 		if (ret) {
1279 			log_warnx("%s: cannot initialize mtx (%d)",
1280 			    __progname, ret);
1281 			return (ret);
1282 		}
1283 
1284 		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
1285 		if (ret) {
1286 			log_warnx("%s: cannot initialize unpause var (%d)",
1287 			    __progname, ret);
1288 			return (ret);
1289 		}
1290 
1291 		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
1292 		if (ret) {
1293 			log_warnx("%s: cannot initialize unpause mtx (%d)",
1294 			    __progname, ret);
1295 			return (ret);
1296 		}
1297 
1298 		vcpu_hlt[i] = 0;
1299 
1300 		/* Start each VCPU run thread at vcpu_run_loop */
1301 		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
1302 		if (ret) {
1303 			/* caller will _exit after this return */
1304 			ret = errno;
1305 			log_warn("%s: could not create vcpu thread %zu",
1306 			    __func__, i);
1307 			return (ret);
1308 		}
1309 	}
1310 
1311 	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
1312 	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
1313 	if (ret) {
1314 		errno = ret;
1315 		log_warn("%s: could not create event thread", __func__);
1316 		return (ret);
1317 	}
1318 
1319 	for (;;) {
1320 		ret = pthread_cond_wait(&threadcond, &threadmutex);
1321 		if (ret) {
1322 			log_warn("%s: waiting on thread state condition "
1323 			    "variable failed", __func__);
1324 			return (ret);
1325 		}
1326 
1327 		/*
1328 		 * Did a VCPU thread exit with an error? => return the first one
1329 		 */
1330 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1331 			if (vcpu_done[i] == 0)
1332 				continue;
1333 
1334 			if (pthread_join(tid[i], &exit_status)) {
1335 				log_warn("%s: failed to join thread %zd - "
1336 				    "exiting", __progname, i);
1337 				return (EIO);
1338 			}
1339 
1340 			ret = (intptr_t)exit_status;
1341 		}
1342 
1343 		/* Did the event thread exit? => return with an error */
1344 		if (evdone) {
1345 			if (pthread_join(evtid, &exit_status)) {
1346 				log_warn("%s: failed to join event thread - "
1347 				    "exiting", __progname);
1348 				return (EIO);
1349 			}
1350 
1351 			log_warnx("%s: vm %d event thread exited "
1352 			    "unexpectedly", __progname, vcp->vcp_id);
1353 			return (EIO);
1354 		}
1355 
1356 		/* Did all VCPU threads exit successfully? => return */
1357 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1358 			if (vcpu_done[i] == 0)
1359 				break;
1360 		}
1361 		if (i == vcp->vcp_ncpus)
1362 			return (ret);
1363 
1364 		/* Some more threads to wait for, start over */
1365 	}
1366 
1367 	return (ret);
1368 }
1369 
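/*
 * event_thread
 *
 * Runs the libevent loop for this VM.  When event_dispatch() returns,
 * the "done" flag passed in 'arg' is set and the main thread is woken
 * via the thread state condition variable.
 *
 * Parameters:
 *  arg: pointer to the event thread's "done" flag
 *
 * Return values:
 *  The return value of event_dispatch(), cast to a pointer.
 */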
1370 void *
1371 event_thread(void *arg)
1372 {
1373 	uint8_t *donep = arg;
1374 	intptr_t ret;
1375 
1376 	ret = event_dispatch();
1377 
1378 	mutex_lock(&threadmutex);
1379 	*donep = 1;
1380 	pthread_cond_signal(&threadcond);
1381 	mutex_unlock(&threadmutex);
1382 
1383 	return (void *)ret;
1384 }
1385 
1386 /*
1387  * vcpu_run_loop
1388  *
1389  * Runs a single VCPU until vmm(4) requires help handling an exit,
1390  * or the VM terminates.
1391  *
1392  * Parameters:
1393  *  arg: vcpu_run_params for the VCPU being run by this thread
1394  *
1395  * Return values:
1396  *  NULL: the VCPU shut down properly
1397  *  !NULL: error processing VCPU run, or the VCPU shut down abnormally
1398  */
1399 void *
1400 vcpu_run_loop(void *arg)
1401 {
1402 	struct vm_run_params *vrp = (struct vm_run_params *)arg;
1403 	intptr_t ret = 0;
1404 	int irq;
1405 	uint32_t n;
1406 
1407 	vrp->vrp_continue = 0;
1408 	n = vrp->vrp_vcpu_id;
1409 
1410 	for (;;) {
1411 		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
1412 
1413 		if (ret) {
1414 			log_warnx("%s: can't lock vcpu run mtx (%d)",
1415 			    __func__, (int)ret);
1416 			return ((void *)ret);
1417 		}
1418 
1419 		/* If we are halted and need to pause, pause */
1420 		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
1421 			ret = pthread_barrier_wait(&vm_pause_barrier);
1422 			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
1423 				log_warnx("%s: could not wait on pause barrier (%d)",
1424 				    __func__, (int)ret);
1425 				return ((void *)ret);
1426 			}
1427 
1428 			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
1429 			if (ret) {
1430 				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
1431 				    __func__, (int)ret);
1432 				return ((void *)ret);
1433 			}
1434 
1435 			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
1436 			    &vcpu_unpause_mtx[n]);
1437 			if (ret) {
1438 				log_warnx(
1439 				    "%s: can't wait on unpause cond (%d)",
1440 				    __func__, (int)ret);
1441 				break;
1442 			}
1443 			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
1444 			if (ret) {
1445 				log_warnx("%s: can't unlock unpause mtx (%d)",
1446 				    __func__, (int)ret);
1447 				break;
1448 			}
1449 		}
1450 
1451 		/* If we are halted and not paused, wait */
1452 		if (vcpu_hlt[n]) {
1453 			ret = pthread_cond_wait(&vcpu_run_cond[n],
1454 			    &vcpu_run_mtx[n]);
1455 
1456 			if (ret) {
1457 				log_warnx(
1458 				    "%s: can't wait on cond (%d)",
1459 				    __func__, (int)ret);
1460 				(void)pthread_mutex_unlock(
1461 				    &vcpu_run_mtx[n]);
1462 				break;
1463 			}
1464 		}
1465 
1466 		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
1467 
1468 		if (ret) {
1469 			log_warnx("%s: can't unlock mutex on cond (%d)",
1470 			    __func__, (int)ret);
1471 			break;
1472 		}
1473 
1474 		if (vrp->vrp_irqready && i8259_is_pending()) {
1475 			irq = i8259_ack();
1476 			vrp->vrp_irq = irq;
1477 		} else
1478 			vrp->vrp_irq = 0xFFFF;
1479 
1480 		/* Still more pending? */
1481 		if (i8259_is_pending()) {
1482 			/*
1483 			 * XXX can probably avoid ioctls here by providing intr
1484 			 * in vrp
1485 			 */
1486 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1487 			    vrp->vrp_vcpu_id, 1)) {
1488 				fatal("can't set INTR");
1489 			}
1490 		} else {
1491 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1492 			    vrp->vrp_vcpu_id, 0)) {
1493 				fatal("can't clear INTR");
1494 			}
1495 		}
1496 
1497 		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
1498 			/* If run ioctl failed, exit */
1499 			ret = errno;
1500 			log_warn("%s: vm %d / vcpu %d run ioctl failed",
1501 			    __func__, vrp->vrp_vm_id, n);
1502 			break;
1503 		}
1504 
1505 		/* If the VM is terminating, exit normally */
1506 		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
1507 			ret = (intptr_t)NULL;
1508 			break;
1509 		}
1510 
1511 		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
1512 			/*
1513 			 * vmm(4) needs help handling an exit, handle in
1514 			 * vcpu_exit.
1515 			 */
1516 			ret = vcpu_exit(vrp);
1517 			if (ret)
1518 				break;
1519 		}
1520 	}
1521 
1522 	mutex_lock(&threadmutex);
1523 	vcpu_done[n] = 1;
1524 	pthread_cond_signal(&threadcond);
1525 	mutex_unlock(&threadmutex);
1526 
1527 	return ((void *)ret);
1528 }
1529 
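/*
 * vcpu_pic_intr
 *
 * Tells vmm(4), via the VMM_IOC_INTR ioctl, whether the emulated i8259
 * PIC has an interrupt pending for the given VCPU so that the kernel can
 * assert or deassert the virtual INTR line.
 *
 * Parameters:
 *  vm_id: VM ID
 *  vcpu_id: VCPU ID
 *  intr: 1 if an interrupt is pending, 0 otherwise
 *
 * Return values:
 *  0: success
 *  !0: errno returned by the ioctl
 */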
1530 int
1531 vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
1532 {
1533 	struct vm_intr_params vip;
1534 
1535 	memset(&vip, 0, sizeof(vip));
1536 
1537 	vip.vip_vm_id = vm_id;
1538 	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
1539 	vip.vip_intr = intr;
1540 
1541 	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
1542 		return (errno);
1543 
1544 	return (0);
1545 }
1546 
1547 /*
1548  * vcpu_exit_pci
1549  *
1550  * Handle all I/O to the emulated PCI subsystem.
1551  *
1552  * Parameters:
1553  *  vrp: vcpu run parameters containing guest state for this exit
1554  *
1555  * Return value:
1556  *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
1557  *      be injected.
1558  */
1559 uint8_t
1560 vcpu_exit_pci(struct vm_run_params *vrp)
1561 {
1562 	struct vm_exit *vei = vrp->vrp_exit;
1563 	uint8_t intr;
1564 
1565 	intr = 0xFF;
1566 
1567 	switch (vei->vei.vei_port) {
1568 	case PCI_MODE1_ADDRESS_REG:
1569 		pci_handle_address_reg(vrp);
1570 		break;
1571 	case PCI_MODE1_DATA_REG:
1572 	case PCI_MODE1_DATA_REG + 1:
1573 	case PCI_MODE1_DATA_REG + 2:
1574 	case PCI_MODE1_DATA_REG + 3:
1575 		pci_handle_data_reg(vrp);
1576 		break;
1577 	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
1578 		intr = pci_handle_io(vrp);
1579 		break;
1580 	default:
1581 		log_warnx("%s: unknown PCI register 0x%llx",
1582 		    __progname, (uint64_t)vei->vei.vei_port);
1583 		break;
1584 	}
1585 
1586 	return (intr);
1587 }
1588 
1589 /*
1590  * vcpu_exit_inout
1591  *
1592  * Handle all I/O exits that need to be emulated in vmd. This includes the
1593  * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
1594  *
1595  * Parameters:
1596  *  vrp: vcpu run parameters containing guest state for this exit
1597  */
1598 void
1599 vcpu_exit_inout(struct vm_run_params *vrp)
1600 {
1601 	struct vm_exit *vei = vrp->vrp_exit;
1602 	uint8_t intr = 0xFF;
1603 
1604 	if (ioports_map[vei->vei.vei_port] != NULL)
1605 		intr = ioports_map[vei->vei.vei_port](vrp);
1606 	else if (vei->vei.vei_dir == VEI_DIR_IN)
1607 		set_return_data(vei, 0xFFFFFFFF);
1608 
1609 	if (intr != 0xFF)
1610 		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
1611 }
1612 
1613 /*
1614  * vcpu_exit_eptviolation
1615  *
1616  * handle an EPT Violation
1617  *
1618  * Parameters:
1619  *  vrp: vcpu run parameters containing guest state for this exit
1620  *
1621  * Return values:
1622  *  0: no action required
1623  *  EAGAIN: a protection fault occurred, kill the vm.
1624  */
1625 int
1626 vcpu_exit_eptviolation(struct vm_run_params *vrp)
1627 {
1628 	struct vm_exit *ve = vrp->vrp_exit;
1629 
1630 	/*
1631 	 * vmm(4) may be exiting to vmd to handle a pending interrupt,
1632 	 * but the last exit type may have been VMX_EXIT_EPT_VIOLATION;
1633 	 * check the fault_type to ensure we really are processing
1634 	 * a VMX_EXIT_EPT_VIOLATION.
1635 	 */
1636 	if (ve->vee.vee_fault_type == VEE_FAULT_PROTECT) {
1637 		log_debug("%s: EPT Violation: rip=0x%llx",
1638 		    __progname, vrp->vrp_exit->vrs.vrs_gprs[VCPU_REGS_RIP]);
1639 		return (EAGAIN);
1640 	}
1641 
1642 	return (0);
1643 }
1644 
1645 /*
1646  * vcpu_exit
1647  *
1648  * Handle a vcpu exit. This function is called when it is determined that
1649  * vmm(4) requires the assistance of vmd to support a particular guest
1650  * exit type (eg, accessing an I/O port or device). Guest state is contained
1651  * in 'vrp', and will be resent to vmm(4) on exit completion.
1652  *
1653  * Upon conclusion of handling the exit, the function determines if any
1654  * interrupts should be injected into the guest, and asserts the proper
1655  * IRQ line whose interrupt should be vectored.
1656  *
1657  * Parameters:
1658  *  vrp: vcpu run parameters containing guest state for this exit
1659  *
1660  * Return values:
1661  *  0: the exit was handled successfully
1662  *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
1663  */
1664 int
1665 vcpu_exit(struct vm_run_params *vrp)
1666 {
1667 	int ret;
1668 
1669 	switch (vrp->vrp_exit_reason) {
1670 	case VMX_EXIT_INT_WINDOW:
1671 	case SVM_VMEXIT_VINTR:
1672 	case VMX_EXIT_CPUID:
1673 	case VMX_EXIT_EXTINT:
1674 	case SVM_VMEXIT_INTR:
1675 	case SVM_VMEXIT_NPF:
1676 	case SVM_VMEXIT_MSR:
1677 	case SVM_VMEXIT_CPUID:
1678 		/*
1679 		 * We may be exiting to vmd to handle a pending interrupt but
1680 		 * at the same time the last exit type may have been one of
1681 		 * these. In this case, there's nothing extra to be done
1682 		 * here (and falling through to the default case below results
1683 		 * in more vmd log spam).
1684 		 */
1685 		break;
1686 	case VMX_EXIT_EPT_VIOLATION:
1687 		ret = vcpu_exit_eptviolation(vrp);
1688 		if (ret)
1689 			return (ret);
1690 
1691 		break;
1692 	case VMX_EXIT_IO:
1693 	case SVM_VMEXIT_IOIO:
1694 		vcpu_exit_inout(vrp);
1695 		break;
1696 	case VMX_EXIT_HLT:
1697 	case SVM_VMEXIT_HLT:
1698 		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1699 		if (ret) {
1700 			log_warnx("%s: can't lock vcpu mutex (%d)",
1701 			    __func__, ret);
1702 			return (ret);
1703 		}
1704 		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
1705 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1706 		if (ret) {
1707 			log_warnx("%s: can't unlock vcpu mutex (%d)",
1708 			    __func__, ret);
1709 			return (ret);
1710 		}
1711 		break;
1712 	case VMX_EXIT_TRIPLE_FAULT:
1713 	case SVM_VMEXIT_SHUTDOWN:
1714 		/* reset VM */
1715 		return (EAGAIN);
1716 	default:
1717 		log_debug("%s: unknown exit reason 0x%x",
1718 		    __progname, vrp->vrp_exit_reason);
1719 	}
1720 
1721 	vrp->vrp_continue = 1;
1722 
1723 	return (0);
1724 }
1725 
1726 /*
1727  * find_gpa_range
1728  *
1729  * Search for a contiguous guest physical mem range.
1730  *
1731  * Parameters:
1732  *  vcp: VM create parameters that contain the memory map to search in
1733  *  gpa: the starting guest physical address
1734  *  len: the length of the memory range
1735  *
1736  * Return values:
1737  *  NULL: on failure if there is no memory range as described by the parameters
1738  *  Pointer to vm_mem_range that contains the start of the range otherwise.
1739  */
1740 static struct vm_mem_range *
1741 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
1742 {
1743 	size_t i, n;
1744 	struct vm_mem_range *vmr;
1745 
1746 	/* Find the first vm_mem_range that contains gpa */
1747 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1748 		vmr = &vcp->vcp_memranges[i];
1749 		if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
1750 			break;
1751 	}
1752 
1753 	/* No range found. */
1754 	if (i == vcp->vcp_nmemranges)
1755 		return (NULL);
1756 
1757 	/*
1758 	 * vmr may cover the range [gpa, gpa + len) only partly. Make
1759 	 * sure that the following vm_mem_ranges are contiguous and
1760 	 * cover the rest.
1761 	 */
1762 	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
1763 	if (len < n)
1764 		len = 0;
1765 	else
1766 		len -= n;
1767 	gpa = vmr->vmr_gpa + vmr->vmr_size;
1768 	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
1769 		vmr = &vcp->vcp_memranges[i];
1770 		if (gpa != vmr->vmr_gpa)
1771 			return (NULL);
1772 		if (len <= vmr->vmr_size)
1773 			len = 0;
1774 		else
1775 			len -= vmr->vmr_size;
1776 
1777 		gpa = vmr->vmr_gpa + vmr->vmr_size;
1778 	}
1779 
1780 	if (len != 0)
1781 		return (NULL);
1782 
1783 	return (vmr);
1784 }
1785 
1786 /*
1787  * write_mem
1788  *
1789  * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
1790  *
1791  * Parameters:
1792  *  dst: the destination paddr_t in the guest VM
1793  *  buf: data to copy (or NULL to zero the data)
1794  *  len: number of bytes to copy
1795  *
1796  * Return values:
1797  *  0: success
1798  *  EINVAL: if the guest physical memory range [dst, dst + len) does not
1799  *      exist in the guest.
1800  */
1801 int
1802 write_mem(paddr_t dst, const void *buf, size_t len)
1803 {
1804 	const char *from = buf;
1805 	char *to;
1806 	size_t n, off;
1807 	struct vm_mem_range *vmr;
1808 
1809 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
1810 	if (vmr == NULL) {
1811 		errno = EINVAL;
1812 		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
1813 		    "len = 0x%zx", __func__, dst, len);
1814 		return (EINVAL);
1815 	}
1816 
1817 	off = dst - vmr->vmr_gpa;
1818 	while (len != 0) {
1819 		n = vmr->vmr_size - off;
1820 		if (len < n)
1821 			n = len;
1822 
1823 		to = (char *)vmr->vmr_va + off;
1824 		if (buf == NULL)
1825 			memset(to, 0, n);
1826 		else {
1827 			memcpy(to, from, n);
1828 			from += n;
1829 		}
1830 		len -= n;
1831 		off = 0;
1832 		vmr++;
1833 	}
1834 
1835 	return (0);
1836 }
1837 
1838 /*
1839  * read_mem
1840  *
1841  * Reads memory at guest paddr 'src' into 'buf'.
1842  *
1843  * Parameters:
1844  *  src: the source paddr_t in the guest VM to read from.
1845  *  buf: destination (local) buffer
1846  *  len: number of bytes to read
1847  *
1848  * Return values:
1849  *  0: success
1850  *  EINVAL: if the guest physical memory range [src, src + len) does not
1851  *      exist in the guest.
1852  */
1853 int
1854 read_mem(paddr_t src, void *buf, size_t len)
1855 {
1856 	char *from, *to = buf;
1857 	size_t n, off;
1858 	struct vm_mem_range *vmr;
1859 
1860 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
1861 	if (vmr == NULL) {
1862 		errno = EINVAL;
1863 		log_warn("%s: failed - invalid memory range src = 0x%lx, "
1864 		    "len = 0x%zx", __func__, src, len);
1865 		return (EINVAL);
1866 	}
1867 
1868 	off = src - vmr->vmr_gpa;
1869 	while (len != 0) {
1870 		n = vmr->vmr_size - off;
1871 		if (len < n)
1872 			n = len;
1873 
1874 		from = (char *)vmr->vmr_va + off;
1875 		memcpy(to, from, n);
1876 
1877 		to += n;
1878 		len -= n;
1879 		off = 0;
1880 		vmr++;
1881 	}
1882 
1883 	return (0);
1884 }
1885 
1886 /*
1887  * vcpu_assert_pic_irq
1888  *
1889  * Injects the specified IRQ on the supplied vcpu/vm
1890  *
1891  * Parameters:
1892  *  vm_id: VM ID to inject to
1893  *  vcpu_id: VCPU ID to inject to
1894  *  irq: IRQ to inject
1895  */
1896 void
1897 vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1898 {
1899 	int ret;
1900 
1901 	i8259_assert_irq(irq);
1902 
1903 	if (i8259_is_pending()) {
1904 		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
1905 			fatalx("%s: can't assert INTR", __func__);
1906 
1907 		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
1908 		if (ret)
1909 			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);
1910 
1911 		vcpu_hlt[vcpu_id] = 0;
1912 		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
1913 		if (ret)
1914 			fatalx("%s: can't signal (%d)", __func__, ret);
1915 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
1916 		if (ret)
1917 			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
1918 	}
1919 }
1920 
1921 /*
1922  * vcpu_deassert_pic_irq
1923  *
1924  * Clears the specified IRQ on the supplied vcpu/vm
1925  *
1926  * Parameters:
1927  *  vm_id: VM ID to clear in
1928  *  vcpu_id: VCPU ID to clear in
1929  *  irq: IRQ to clear
1930  */
1931 void
1932 vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1933 {
1934 	i8259_deassert_irq(irq);
1935 
1936 	if (!i8259_is_pending()) {
1937 		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
1938 			fatalx("%s: can't deassert INTR for vm_id %d, "
1939 			    "vcpu_id %d", __func__, vm_id, vcpu_id);
1940 	}
1941 }
1942 
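/*
 * Illustrative sketch (not part of vmd; the IRQ number is hypothetical):
 * a level-triggered device model raises its interrupt line while it has
 * work pending and lowers it once the guest has drained it.  Device models
 * in vmd typically direct PIC interrupts at vcpu 0.
 */
static void
example_device_set_irq(uint32_t vm_id, int pending)
{
	if (pending)
		vcpu_assert_pic_irq(vm_id, 0, 9);	/* raise IRQ 9 */
	else
		vcpu_deassert_pic_irq(vm_id, 0, 9);	/* lower IRQ 9 */
}
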
1943 /*
1944  * fd_hasdata
1945  *
1946  * Determines if data can be read from a file descriptor.
1947  *
1948  * Parameters:
1949  *  fd: the fd to check
1950  *
1951  * Return values:
1952  *  1 if data can be read from an fd, or 0 otherwise.
1953  */
1954 int
1955 fd_hasdata(int fd)
1956 {
1957 	struct pollfd pfd[1];
1958 	int nready, hasdata = 0;
1959 
1960 	pfd[0].fd = fd;
1961 	pfd[0].events = POLLIN;
1962 	nready = poll(pfd, 1, 0);
1963 	if (nready == -1)
1964 		log_warn("checking file descriptor for data failed");
1965 	else if (nready == 1 && pfd[0].revents & POLLIN)
1966 		hasdata = 1;
1967 	return (hasdata);
1968 }
1969 
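/*
 * Illustrative sketch (not part of vmd): fd_hasdata() is a non-blocking
 * poll(2) wrapper, so a device model can cheaply test whether a backing
 * fd is readable before committing to a read(2).
 */
static void
example_drain_fd(int fd)
{
	char buf[512];

	while (fd_hasdata(fd)) {
		if (read(fd, buf, sizeof(buf)) <= 0)
			break;
		/* consume buf here */
	}
}
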
1970 /*
1971  * mutex_lock
1972  *
1973  * Wrapper for pthread_mutex_lock that checks for errors and exits on
1974  * failure.
1975  */
1976 void
1977 mutex_lock(pthread_mutex_t *m)
1978 {
1979 	int ret;
1980 
1981 	ret = pthread_mutex_lock(m);
1982 	if (ret) {
1983 		errno = ret;
1984 		fatal("could not acquire mutex");
1985 	}
1986 }
1987 
1988 /*
1989  * mutex_unlock
1990  *
1991  * Wrapper for pthread_mutex_unlock that checks for errors and exits on
1992  * failure.
1993  */
1994 void
1995 mutex_unlock(pthread_mutex_t *m)
1996 {
1997 	int ret;
1998 
1999 	ret = pthread_mutex_unlock(m);
2000 	if (ret) {
2001 		errno = ret;
2002 		fatal("could not release mutex");
2003 	}
2004 }
2005 
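/*
 * Illustrative sketch (not part of vmd; the mutex and counter are
 * hypothetical): because the wrappers turn pthread errors into fatal(),
 * a critical section needs no per-call error handling.
 */
static pthread_mutex_t example_mtx = PTHREAD_MUTEX_INITIALIZER;
static int example_counter;

static void
example_bump_counter(void)
{
	mutex_lock(&example_mtx);
	example_counter++;		/* protected by example_mtx */
	mutex_unlock(&example_mtx);
}
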
2006 /*
2007  * set_return_data
2008  *
2009  * Utility function for manipulating register data in vm exit info structs. This
2010  * function ensures that the data is copied to the vei->vei.vei_data field with
2011  * the proper size for the operation being performed.
2012  *
2013  * Parameters:
2014  *  vei: exit information
2015  *  data: return data
2016  */
2017 void
2018 set_return_data(struct vm_exit *vei, uint32_t data)
2019 {
2020 	switch (vei->vei.vei_size) {
2021 	case 1:
2022 		vei->vei.vei_data &= ~0xFF;
2023 		vei->vei.vei_data |= (uint8_t)data;
2024 		break;
2025 	case 2:
2026 		vei->vei.vei_data &= ~0xFFFF;
2027 		vei->vei.vei_data |= (uint16_t)data;
2028 		break;
2029 	case 4:
2030 		vei->vei.vei_data = data;
2031 		break;
2032 	}
2033 }
2034 
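/*
 * Illustrative sketch (not part of vmd; the status value is hypothetical):
 * when emulating an IN instruction, a port handler stores its result with
 * set_return_data() so that only vei_size bytes of the guest's data
 * register are updated.
 */
static void
example_port_in(struct vm_exit *vei)
{
	/* Pretend the device's status register reads back as 0x42. */
	set_return_data(vei, 0x42);
}
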
2035 /*
2036  * get_input_data
2037  *
2038  * Utility function for manipulating register data in vm exit info
2039  * structs. This function ensures that the data is copied from the
2040  * vei->vei.vei_data field with the proper size for the operation being
2041  * performed.
2042  *
2043  * Parameters:
2044  *  vei: exit information
2045  *  data: location to store the result
2046  */
2047 void
2048 get_input_data(struct vm_exit *vei, uint32_t *data)
2049 {
2050 	switch (vei->vei.vei_size) {
2051 	case 1:
2052 		*data &= 0xFFFFFF00;
2053 		*data |= (uint8_t)vei->vei.vei_data;
2054 		break;
2055 	case 2:
2056 		*data &= 0xFFFF0000;
2057 		*data |= (uint16_t)vei->vei.vei_data;
2058 		break;
2059 	case 4:
2060 		*data = vei->vei.vei_data;
2061 		break;
2062 	default:
2063 		log_warnx("%s: invalid i/o size %d", __func__,
2064 		    vei->vei.vei_size);
2065 	}
2066 
2067 }
2068 
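/*
 * Illustrative sketch (not part of vmd): when emulating an OUT
 * instruction, a port handler retrieves the value the guest wrote with
 * get_input_data(), which only replaces the low vei_size bytes of 'data'.
 */
static void
example_port_out(struct vm_exit *vei)
{
	uint32_t data = 0;

	get_input_data(vei, &data);
	log_debug("guest wrote 0x%x (%d byte i/o)", data,
	    (int)vei->vei.vei_size);
}
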
2069 /*
2070  * translate_gva
2071  *
2072  * Translates a guest virtual address to a guest physical address by walking
2073  * the currently active page table (if needed).
2074  *
2075  * Note - this function can possibly alter the supplied VCPU state.
2076  *  Specifically, it may inject exceptions depending on the current VCPU
2077  *  configuration, and may alter %cr2 on #PF. Consequently, this function
2078  *  should only be used as part of instruction emulation.
2079  *
2080  * Parameters:
2081  *  exit: vm_exit of the VCPU this translation should be performed for
2082  *   (guest MMU settings are gathered from its register state)
2083  *  va: virtual address to translate
2084  *  pa: pointer to paddr_t variable that will receive the translated physical
2085  *   address. 'pa' is unchanged on error.
2086  *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
2087  *   the address should be translated
2088  *
2089  * Return values:
2090  *  0: success; 'pa' contains the guest physical address mapped by 'va'.
2091  *  EFAULT: the PTE for 'va' is unmapped or a page table could not be read.
2092  *  EPERM: the page protections do not permit the requested access.
2093  *  EINVAL: 'pa' is NULL or the guest is not in protected mode.
2094  *  EIO: the accessed/dirty bits could not be written back to the PTE.
2095  */
2096 int
2097 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
2098 {
2099 	int level, shift, pdidx;
2100 	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
2101 	uint64_t shift_width, pte_size;
2102 	struct vcpu_reg_state *vrs;
2103 
2104 	vrs = &exit->vrs;
2105 
2106 	if (!pa)
2107 		return (EINVAL);
2108 
2109 	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
2110 		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
2111 		*pa = va;
2112 		return (0);
2113 	}
2114 
2115 	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
2116 
2117 	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
2118 	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
2119 
2120 	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
2121 		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
2122 			pte_size = sizeof(uint64_t);
2123 			shift_width = 9;
2124 
2125 			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
2126 				/* 4 level paging */
2127 				level = 4;
2128 				mask = L4_MASK;
2129 				shift = L4_SHIFT;
2130 			} else {
2131 				/* 32 bit with PAE paging */
2132 				level = 3;
2133 				mask = L3_MASK;
2134 				shift = L3_SHIFT;
2135 			}
2136 		} else {
2137 			/* 32 bit paging */
2138 			level = 2;
2139 			shift_width = 10;
2140 			mask = 0xFFC00000;
2141 			shift = 22;
2142 			pte_size = sizeof(uint32_t);
2143 		}
2144 	} else
2145 		return (EINVAL);
2146 
2147 	/* XXX: Check for R bit in segment selector and set A bit */
2148 
2149 	for (;level > 0; level--) {
2150 		pdidx = (va & mask) >> shift;
2151 		pte_paddr = (pt_paddr) + (pdidx * pte_size);
2152 
2153 		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
2154 		    level, pte_paddr);
2155 		if (read_mem(pte_paddr, &pte, pte_size)) {
2156 			log_warn("%s: failed to read pte", __func__);
2157 			return (EFAULT);
2158 		}
2159 
2160 		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
2161 		    pte);
2162 
2163 		/* XXX: Set CR2  */
2164 		if (!(pte & PG_V))
2165 			return (EFAULT);
2166 
2167 		/* XXX: Check for SMAP */
2168 		if ((mode == PROT_WRITE) && !(pte & PG_RW))
2169 			return (EPERM);
2170 
2171 		if ((exit->cpl > 0) && !(pte & PG_u))
2172 			return (EPERM);
2173 
2174 		pte = pte | PG_U;
2175 		if (mode == PROT_WRITE)
2176 			pte = pte | PG_M;
2177 		if (write_mem(pte_paddr, &pte, pte_size)) {
2178 			log_warn("%s: failed to write back flags to pte",
2179 			    __func__);
2180 			return (EIO);
2181 		}
2182 
2183 		/* XXX: EINVAL if in 32bit and  PG_PS is 1 but CR4.PSE is 0 */
2184 		if (pte & PG_PS)
2185 			break;
2186 
2187 		if (level > 1) {
2188 			pt_paddr = pte & PG_FRAME;
2189 			shift -= shift_width;
2190 			mask = mask >> shift_width;
2191 		}
2192 	}
2193 
2194 	low_mask = (1 << shift) - 1;
2195 	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
2196 	*pa = (pte & high_mask) | (va & low_mask);
2197 
2198 	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx", __func__, va, *pa);
2199 
2200 	return (0);
2201 }
2202 
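/*
 * Illustrative sketch (not part of vmd; the helper is hypothetical):
 * during instruction emulation a guest-virtual pointer must be resolved
 * to a guest-physical address with translate_gva() before read_mem() or
 * write_mem() can operate on it.
 */
static int
example_read_guest_va(struct vm_exit *exit, uint64_t va, uint32_t *out)
{
	uint64_t pa;

	if (translate_gva(exit, va, &pa, PROT_READ) != 0)
		return (-1);

	return (read_mem(pa, out, sizeof(*out)));
}
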
2203 /*
2204  * vm_pipe_init
2205  *
2206  * Initialize a vm_dev_pipe, setting up its file descriptors and its
2207  * event structure with the given callback.
2208  *
2209  * Parameters:
2210  *  p: pointer to vm_dev_pipe struct to initialize
2211  *  cb: callback to use for READ events on the read end of the pipe
2212  */
2213 void
2214 vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
2215 {
2216 	int ret;
2217 	int fds[2];
2218 
2219 	memset(p, 0, sizeof(struct vm_dev_pipe));
2220 
2221 	ret = pipe(fds);
2222 	if (ret)
2223 		fatal("failed to create vm_dev_pipe pipe");
2224 
2225 	p->read = fds[0];
2226 	p->write = fds[1];
2227 
2228 	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
2229 }
2230 
2231 /*
2232  * vm_pipe_send
2233  *
2234  * Send a message to an emulated device via the provided vm_dev_pipe.
2235  *
2236  * Parameters:
2237  *  p: pointer to initialized vm_dev_pipe
2238  *  msg: message to send in the channel
2239  */
2240 void
2241 vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
2242 {
2243 	size_t n;
2244 	n = write(p->write, &msg, sizeof(msg));
2245 	if (n != sizeof(msg))
2246 		fatal("failed to write to device pipe");
2247 }
2248 
2249 /*
2250  * vm_pipe_recv
2251  *
2252  * Receive a message for an emulated device via the provided vm_dev_pipe.
2253  * Returns the message value; on read failure the process exits via fatal().
2254  *
2255  * Parameters:
2256  *  p: pointer to initialized vm_dev_pipe
2257  *
2258  * Return values:
2259  *  a value of enum pipe_msg_type or fatal exit on read(2) error
2260  */
2261 enum pipe_msg_type
2262 vm_pipe_recv(struct vm_dev_pipe *p)
2263 {
2264 	size_t n;
2265 	enum pipe_msg_type msg;
2266 	n = read(p->read, &msg, sizeof(msg));
2267 	if (n != sizeof(msg))
2268 		fatal("failed to read from device pipe");
2269 
2270 	return msg;
2271 }
2272
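/*
 * Illustrative sketch (not part of vmd; the pipe, callback, and message
 * handling are hypothetical): a device model initializes its vm_dev_pipe
 * once and adds the read event to the event loop; other threads then hand
 * it messages with vm_pipe_send(), and the callback drains them with
 * vm_pipe_recv().
 */
static struct vm_dev_pipe example_pipe;

static void
example_pipe_cb(int fd, short event, void *arg)
{
	enum pipe_msg_type msg = vm_pipe_recv(&example_pipe);

	log_debug("%s: received pipe message %d", __func__, (int)msg);
}

static void
example_pipe_setup(void)
{
	vm_pipe_init(&example_pipe, example_pipe_cb);
	event_add(&example_pipe.read_ev, NULL);
}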