1 /*	$OpenBSD: vm.c,v 1.65 2021/09/01 11:08:21 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/ioctl.h>
21 #include <sys/queue.h>
22 #include <sys/wait.h>
23 #include <sys/uio.h>
24 #include <sys/stat.h>
25 #include <sys/socket.h>
26 #include <sys/time.h>
27 #include <sys/mman.h>
28 
29 #include <dev/ic/i8253reg.h>
30 #include <dev/isa/isareg.h>
31 #include <dev/pci/pcireg.h>
32 
33 #include <machine/param.h>
34 #include <machine/psl.h>
35 #include <machine/pte.h>
36 #include <machine/specialreg.h>
37 #include <machine/vmmvar.h>
38 
39 #include <net/if.h>
40 
41 #include <errno.h>
42 #include <event.h>
43 #include <fcntl.h>
44 #include <imsg.h>
45 #include <limits.h>
46 #include <poll.h>
47 #include <pthread.h>
48 #include <stddef.h>
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <unistd.h>
53 #include <util.h>
54 
55 #include "atomicio.h"
56 #include "fw_cfg.h"
57 #include "i8253.h"
58 #include "i8259.h"
59 #include "loadfile.h"
60 #include "mc146818.h"
61 #include "ns8250.h"
62 #include "pci.h"
63 #include "virtio.h"
64 #include "vmd.h"
65 #include "vmm.h"
66 
67 io_fn_t ioports_map[MAX_PORTS];
68 
69 int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
70     struct vmop_create_params *, struct vcpu_reg_state *);
71 void vm_dispatch_vmm(int, short, void *);
72 void *event_thread(void *);
73 void *vcpu_run_loop(void *);
74 int vcpu_exit(struct vm_run_params *);
75 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
76 void create_memory_map(struct vm_create_params *);
77 int alloc_guest_mem(struct vm_create_params *);
78 int vmm_create_vm(struct vm_create_params *);
79 void init_emulated_hw(struct vmop_create_params *, int,
80     int[][VM_MAX_BASE_PER_DISK], int *);
81 void restore_emulated_hw(struct vm_create_params *, int, int *,
82     int[][VM_MAX_BASE_PER_DISK], int);
83 void vcpu_exit_inout(struct vm_run_params *);
84 int vcpu_exit_eptviolation(struct vm_run_params *);
85 uint8_t vcpu_exit_pci(struct vm_run_params *);
86 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
87 int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
88 int send_vm(int, struct vm_create_params *);
89 int dump_send_header(int);
90 int dump_vmr(int, struct vm_mem_range *);
91 int dump_mem(int, struct vm_create_params *);
92 void restore_vmr(int, struct vm_mem_range *);
93 void restore_mem(int, struct vm_create_params *);
94 int restore_vm_params(int, struct vm_create_params *);
95 void pause_vm(struct vm_create_params *);
96 void unpause_vm(struct vm_create_params *);
97 
98 int translate_gva(struct vm_exit *, uint64_t, uint64_t *, int);
99 
100 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
101     size_t);
102 
103 int con_fd;
104 struct vmd_vm *current_vm;
105 
106 extern struct vmd *env;
107 
108 extern char *__progname;
109 
110 pthread_mutex_t threadmutex;
111 pthread_cond_t threadcond;
112 
113 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
114 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
115 pthread_barrier_t vm_pause_barrier;
116 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
117 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
118 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
119 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
120 
121 /*
122  * Represents a standard register set for an OS to be booted
123  * as a flat 64 bit address space.
124  *
125  * NOT set here are:
126  *  RIP
127  *  RSP
128  *  GDTR BASE
129  *
130  * Specific bootloaders should clone this structure and override
131  * those fields as needed.
132  *
133  * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
134  *        features of the CPU in use.
135  */
136 static const struct vcpu_reg_state vcpu_init_flat64 = {
137 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
138 	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
139 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
140 	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
141 	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
142 	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
143 	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
144 	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
145 	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
146 	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
147 	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
148 	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
149 	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
150 	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
151 	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
152 	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
153 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
154 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
155 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
156 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
157 	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
158 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
159 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
160 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
161 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
162 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
163 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
164 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
165 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
166 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
167 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
168 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
169 	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
170 	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
171 };
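
/*
 * The positional vrs_sregs initializers above presumably correspond to the
 * vcpu_segment_info fields (selector, limit, access rights, base): flat
 * code and data segments with a 4GB limit, with CS set up for long mode.
 */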
172 
173 /*
174  * Represents a standard register set for a BIOS to be booted
175  * as a flat 16 bit address space.
176  */
177 static const struct vcpu_reg_state vcpu_init_flat16 = {
178 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
179 	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
180 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
181 	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
182 	.vrs_crs[VCPU_REGS_CR3] = 0,
183 	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
184 	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
185 	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
186 	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
187 	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
188 	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
189 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
190 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
191 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
192 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
193 	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
194 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
195 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
196 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
197 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
198 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
199 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
200 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
201 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
202 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
203 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
204 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
205 	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
206 };
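
/*
 * These values mirror the x86 reset state: CS base 0xF0000 plus RIP 0xFFF0
 * makes the first instruction fetch hit the reset vector at 0xFFFF0, and
 * CR0 holds the architectural reset value 0x60000010 (CD | NW | ET), so
 * paging and protection are off.
 */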
207 
208 /*
209  * loadfile_bios
210  *
211  * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
212  * directly into memory.
213  *
214  * Parameters:
215  *  fp: gzFile handle of the BIOS image to load
216  *  size: uncompressed size of the image
217  *  (out) vrs: register state to set on init for this kernel
218  *
219  * Return values:
220  *  0 if successful
221  *  -1 if the BIOS image could not be seeked to or read into place
222  */
223 int
224 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
225 {
226 	off_t	 off;
227 
228 	/* Set up a "flat 16 bit" register state for BIOS */
229 	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
230 
231 	/* Seek to the beginning of the BIOS image */
232 	if (gzseek(fp, 0, SEEK_SET) == -1)
233 		return (-1);
234 
235 	/* The BIOS image must end at 1MB so it covers the reset vector at 0xFFFF0 */
236 	if ((off = 1048576 - size) < 0)
237 		return (-1);
238 
239 	/* Read BIOS image into memory */
240 	if (mread(fp, off, size) != (size_t)size) {
241 		errno = EIO;
242 		return (-1);
243 	}
244 
245 	log_debug("%s: loaded BIOS image", __func__);
246 
247 	return (0);
248 }
249 
250 /*
251  * start_vm
252  *
253  * After forking a new VM process, starts the new VM with the creation
254  * parameters supplied (in the incoming vm->vm_params field). This
255  * function performs a basic sanity check on the incoming parameters
256  * and then performs the following steps to complete the creation of the VM:
257  *
258  * 1. validates and creates the new VM
259  * 2. opens the imsg control channel to the parent and drops more privilege
260  * 3. drops additional privileges by calling pledge(2)
261  * 4. loads the kernel from the disk image or file descriptor
262  * 5. runs the VM's VCPU loops.
263  *
264  * Parameters:
265  *  vm: The VM data structure containing the VM create parameters.
266  *  fd: The imsg socket that is connected to the parent process.
267  *
268  * Return values:
269  *  0: success
270  *  !0 : failure - typically an errno indicating the source of the failure
271  */
272 int
273 start_vm(struct vmd_vm *vm, int fd)
274 {
275 	struct vmop_create_params *vmc = &vm->vm_params;
276 	struct vm_create_params	*vcp = &vmc->vmc_params;
277 	struct vcpu_reg_state	 vrs;
278 	int			 nicfds[VMM_MAX_NICS_PER_VM];
279 	int			 ret;
280 	gzFile			 fp;
281 	size_t			 i;
282 	struct vm_rwregs_params  vrp;
283 	struct stat		 sb;
284 
285 	/* Child */
286 	setproctitle("%s", vcp->vcp_name);
287 	log_procinit(vcp->vcp_name);
288 
289 	if (!(vm->vm_state & VM_STATE_RECEIVED))
290 		create_memory_map(vcp);
291 
292 	ret = alloc_guest_mem(vcp);
293 
294 	if (ret) {
295 		errno = ret;
296 		fatal("could not allocate guest memory - exiting");
297 	}
298 
299 	ret = vmm_create_vm(vcp);
300 	current_vm = vm;
301 
302 	/* send back the kernel-generated vm id (0 on error) */
303 	if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
304 	    sizeof(vcp->vcp_id))
305 		fatal("write vcp id");
306 
307 	if (ret) {
308 		errno = ret;
309 		fatal("create vmm ioctl failed - exiting");
310 	}
311 
312 	/*
313 	 * pledge in the vm processes:
314 	 * stdio - for malloc and basic I/O including events.
315 	 * recvfd - for send/recv.
316 	 * vmm - for the vmm ioctls and operations.
317 	 */
318 	if (pledge("stdio vmm recvfd", NULL) == -1)
319 		fatal("pledge");
320 
321 	if (vm->vm_state & VM_STATE_RECEIVED) {
322 		ret = read(vm->vm_receive_fd, &vrp, sizeof(vrp));
323 		if (ret != sizeof(vrp)) {
324 			fatal("received incomplete vrp - exiting");
325 		}
326 		vrs = vrp.vrwp_regs;
327 	} else {
328 		/*
329 		 * Set up default "flat 64 bit" register state - RIP,
330 		 * RSP, and GDT info will be set in bootloader
331 		 */
332 		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
333 
334 		/* Find and open kernel image */
335 		if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
336 			fatalx("failed to open kernel - exiting");
337 
338 		/* Load kernel image */
339 		ret = loadfile_elf(fp, vcp, &vrs);
340 
341 		/*
342 		 * Try BIOS as a fallback (only if it was provided as an image
343 		 * with vm->vm_kernel and the file is not compressed)
344 		 */
345 		if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
346 		    gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
347 			ret = loadfile_bios(fp, sb.st_size, &vrs);
348 
349 		if (ret)
350 			fatal("failed to load kernel or BIOS - exiting");
351 
352 		gzclose(fp);
353 	}
354 
355 	if (vm->vm_kernel != -1)
356 		close(vm->vm_kernel);
357 
358 	con_fd = vm->vm_tty;
359 	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
360 		fatal("failed to set nonblocking mode on console");
361 
362 	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
363 		nicfds[i] = vm->vm_ifs[i].vif_fd;
364 
365 	event_init();
366 
367 	if (vm->vm_state & VM_STATE_RECEIVED) {
368 		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
369 		    vm->vm_disks, vm->vm_cdrom);
370 		restore_mem(vm->vm_receive_fd, vcp);
371 		if (restore_vm_params(vm->vm_receive_fd, vcp))
372 			fatal("restore vm params failed");
373 		unpause_vm(vcp);
374 	}
375 
376 	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
377 		fatal("setup vm pipe");
378 
379 	/* Execute the vcpu run loop(s) for this VM */
380 	ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);
381 
382 	/* Ensure that any in-flight data is written back */
383 	virtio_shutdown(vm);
384 
385 	return (ret);
386 }
387 
388 /*
389  * vm_dispatch_vmm
390  *
391  * imsg callback for messages that are received from the vmm parent process.
392  */
393 void
394 vm_dispatch_vmm(int fd, short event, void *arg)
395 {
396 	struct vmd_vm		*vm = arg;
397 	struct vmop_result	 vmr;
398 	struct vmop_addr_result	 var;
399 	struct imsgev		*iev = &vm->vm_iev;
400 	struct imsgbuf		*ibuf = &iev->ibuf;
401 	struct imsg		 imsg;
402 	ssize_t			 n;
403 	int			 verbose;
404 
405 	if (event & EV_READ) {
406 		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
407 			fatal("%s: imsg_read", __func__);
408 		if (n == 0)
409 			_exit(0);
410 	}
411 
412 	if (event & EV_WRITE) {
413 		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
414 			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
415 		if (n == 0)
416 			_exit(0);
417 	}
418 
419 	for (;;) {
420 		if ((n = imsg_get(ibuf, &imsg)) == -1)
421 			fatal("%s: imsg_get", __func__);
422 		if (n == 0)
423 			break;
424 
425 #if DEBUG > 1
426 		log_debug("%s: got imsg %d from %s",
427 		    __func__, imsg.hdr.type,
428 		    vm->vm_params.vmc_params.vcp_name);
429 #endif
430 
431 		switch (imsg.hdr.type) {
432 		case IMSG_CTL_VERBOSE:
433 			IMSG_SIZE_CHECK(&imsg, &verbose);
434 			memcpy(&verbose, imsg.data, sizeof(verbose));
435 			log_setverbose(verbose);
436 			break;
437 		case IMSG_VMDOP_VM_SHUTDOWN:
438 			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
439 				_exit(0);
440 			break;
441 		case IMSG_VMDOP_VM_REBOOT:
442 			if (vmmci_ctl(VMMCI_REBOOT) == -1)
443 				_exit(0);
444 			break;
445 		case IMSG_VMDOP_PAUSE_VM:
446 			vmr.vmr_result = 0;
447 			vmr.vmr_id = vm->vm_vmid;
448 			pause_vm(&vm->vm_params.vmc_params);
449 			imsg_compose_event(&vm->vm_iev,
450 			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
451 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
452 			    sizeof(vmr));
453 			break;
454 		case IMSG_VMDOP_UNPAUSE_VM:
455 			vmr.vmr_result = 0;
456 			vmr.vmr_id = vm->vm_vmid;
457 			unpause_vm(&vm->vm_params.vmc_params);
458 			imsg_compose_event(&vm->vm_iev,
459 			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
460 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
461 			    sizeof(vmr));
462 			break;
463 		case IMSG_VMDOP_SEND_VM_REQUEST:
464 			vmr.vmr_id = vm->vm_vmid;
465 			vmr.vmr_result = send_vm(imsg.fd,
466 			    &vm->vm_params.vmc_params);
467 			imsg_compose_event(&vm->vm_iev,
468 			    IMSG_VMDOP_SEND_VM_RESPONSE,
469 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
470 			    sizeof(vmr));
471 			if (!vmr.vmr_result) {
472 				imsg_flush(&current_vm->vm_iev.ibuf);
473 				_exit(0);
474 			}
475 			break;
476 		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
477 			IMSG_SIZE_CHECK(&imsg, &var);
478 			memcpy(&var, imsg.data, sizeof(var));
479 
480 			log_debug("%s: received tap addr %s for nic %d",
481 			    vm->vm_params.vmc_params.vcp_name,
482 			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);
483 
484 			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
485 			break;
486 		default:
487 			fatalx("%s: got invalid imsg %d from %s",
488 			    __func__, imsg.hdr.type,
489 			    vm->vm_params.vmc_params.vcp_name);
490 		}
491 		imsg_free(&imsg);
492 	}
493 	imsg_event_add(iev);
494 }
495 
496 /*
497  * vm_shutdown
498  *
499  * Tell the vmm parent process to shut down or reboot the VM, then exit.
500  */
501 __dead void
502 vm_shutdown(unsigned int cmd)
503 {
504 	switch (cmd) {
505 	case VMMCI_NONE:
506 	case VMMCI_SHUTDOWN:
507 		(void)imsg_compose_event(&current_vm->vm_iev,
508 		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
509 		break;
510 	case VMMCI_REBOOT:
511 		(void)imsg_compose_event(&current_vm->vm_iev,
512 		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
513 		break;
514 	default:
515 		fatalx("invalid vm ctl command: %d", cmd);
516 	}
517 	imsg_flush(&current_vm->vm_iev.ibuf);
518 
519 	_exit(0);
520 }
521 
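/*
 * send_vm
 *
 * Dumps the state of the running VM (create parameters, vcpu registers,
 * emulated device state, guest memory and vm params) to fd so it can be
 * received elsewhere. The VM is paused before dumping and terminated once
 * the dump has been written; on failure it is unpaused instead.
 *
 * Parameters:
 *  fd: file descriptor to write the dump to
 *  vcp: vm create parameters of the VM being dumped
 *
 * Return values:
 *  0: success
 *  !0 : failure
 */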
522 int
523 send_vm(int fd, struct vm_create_params *vcp)
524 {
525 	struct vm_rwregs_params	   vrp;
526 	struct vm_rwvmparams_params vpp;
527 	struct vmop_create_params *vmc;
528 	struct vm_terminate_params vtp;
529 	unsigned int		   flags = 0;
530 	unsigned int		   i;
531 	int			   ret = 0;
532 	size_t			   sz;
533 
534 	if (dump_send_header(fd)) {
535 		log_info("%s: failed to send vm dump header", __func__);
536 		goto err;
537 	}
538 
539 	pause_vm(vcp);
540 
541 	vmc = calloc(1, sizeof(struct vmop_create_params));
542 	if (vmc == NULL) {
543 		log_warn("%s: calloc error getting vmc", __func__);
544 		ret = -1;
545 		goto err;
546 	}
547 
548 	flags |= VMOP_CREATE_MEMORY;
549 	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
550 	    vmop_create_params));
551 	vmc->vmc_flags = flags;
552 	vrp.vrwp_vm_id = vcp->vcp_id;
553 	vrp.vrwp_mask = VM_RWREGS_ALL;
554 	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
555 	vpp.vpp_vm_id = vcp->vcp_id;
556 
557 	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
558 	if (sz != sizeof(struct vmop_create_params)) {
559 		ret = -1;
560 		goto err;
561 	}
562 
563 	for (i = 0; i < vcp->vcp_ncpus; i++) {
564 		vrp.vrwp_vcpu_id = i;
565 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
566 			log_warn("%s: readregs failed", __func__);
567 			goto err;
568 		}
569 
570 		sz = atomicio(vwrite, fd, &vrp,
571 		    sizeof(struct vm_rwregs_params));
572 		if (sz != sizeof(struct vm_rwregs_params)) {
573 			log_warn("%s: dumping registers failed", __func__);
574 			ret = -1;
575 			goto err;
576 		}
577 	}
578 
579 	if ((ret = i8253_dump(fd)))
580 		goto err;
581 	if ((ret = i8259_dump(fd)))
582 		goto err;
583 	if ((ret = ns8250_dump(fd)))
584 		goto err;
585 	if ((ret = mc146818_dump(fd)))
586 		goto err;
587 	if ((ret = fw_cfg_dump(fd)))
588 		goto err;
589 	if ((ret = pci_dump(fd)))
590 		goto err;
591 	if ((ret = virtio_dump(fd)))
592 		goto err;
593 	if ((ret = dump_mem(fd, vcp)))
594 		goto err;
595 
596 	for (i = 0; i < vcp->vcp_ncpus; i++) {
597 		vpp.vpp_vcpu_id = i;
598 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
599 			log_warn("%s: readvmparams failed", __func__);
600 			goto err;
601 		}
602 
603 		sz = atomicio(vwrite, fd, &vpp,
604 		    sizeof(struct vm_rwvmparams_params));
605 		if (sz != sizeof(struct vm_rwvmparams_params)) {
606 			log_warn("%s: dumping vm params failed", __func__);
607 			ret = -1;
608 			goto err;
609 		}
610 	}
611 
612 	vtp.vtp_vm_id = vcp->vcp_id;
613 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
614 		log_warnx("%s: term IOC error: %d, %d", __func__,
615 		    errno, ENOENT);
616 	}
617 err:
618 	close(fd);
619 	if (ret)
620 		unpause_vm(vcp);
621 	return ret;
622 }
623 
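/*
 * dump_send_header
 *
 * Writes the vm dump header (signature, dump version and a set of host
 * CPUID leaves) to fd so that the receiving side can check whether the
 * dump is compatible before restoring it.
 *
 * Return values:
 *  0: success
 *  -1: the header could not be written
 */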
624 int
625 dump_send_header(int fd) {
626 	struct vm_dump_header	   vmh;
627 	int			   i;
628 
629 	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
630 	    sizeof(vmh.vmh_signature));
631 
632 	vmh.vmh_cpuids[0].code = 0x00;
633 	vmh.vmh_cpuids[0].leaf = 0x00;
634 
635 	vmh.vmh_cpuids[1].code = 0x01;
636 	vmh.vmh_cpuids[1].leaf = 0x00;
637 
638 	vmh.vmh_cpuids[2].code = 0x07;
639 	vmh.vmh_cpuids[2].leaf = 0x00;
640 
641 	vmh.vmh_cpuids[3].code = 0x0d;
642 	vmh.vmh_cpuids[3].leaf = 0x00;
643 
644 	vmh.vmh_cpuids[4].code = 0x80000001;
645 	vmh.vmh_cpuids[4].leaf = 0x00;
646 
647 	vmh.vmh_version = VM_DUMP_VERSION;
648 
649 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
650 		CPUID_LEAF(vmh.vmh_cpuids[i].code,
651 		    vmh.vmh_cpuids[i].leaf,
652 		    vmh.vmh_cpuids[i].a,
653 		    vmh.vmh_cpuids[i].b,
654 		    vmh.vmh_cpuids[i].c,
655 		    vmh.vmh_cpuids[i].d);
656 	}
657 
658 	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
659 		return (-1);
660 
661 	return (0);
662 }
663 
664 int
665 dump_mem(int fd, struct vm_create_params *vcp)
666 {
667 	unsigned int	i;
668 	int		ret;
669 	struct		vm_mem_range *vmr;
670 
671 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
672 		vmr = &vcp->vcp_memranges[i];
673 		ret = dump_vmr(fd, vmr);
674 		if (ret)
675 			return ret;
676 	}
677 	return (0);
678 }
679 
680 int
681 restore_vm_params(int fd, struct vm_create_params *vcp) {
682 	unsigned int			i;
683 	struct vm_rwvmparams_params    vpp;
684 
685 	for (i = 0; i < vcp->vcp_ncpus; i++) {
686 		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
687 			log_warn("%s: error restoring vm params", __func__);
688 			return (-1);
689 		}
690 		vpp.vpp_vm_id = vcp->vcp_id;
691 		vpp.vpp_vcpu_id = i;
692 		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
693 			log_debug("%s: writing vm params failed", __func__);
694 			return (-1);
695 		}
696 	}
697 	return (0);
698 }
699 
700 void
701 restore_mem(int fd, struct vm_create_params *vcp)
702 {
703 	unsigned int	     i;
704 	struct vm_mem_range *vmr;
705 
706 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
707 		vmr = &vcp->vcp_memranges[i];
708 		restore_vmr(fd, vmr);
709 	}
710 }
711 
712 int
713 dump_vmr(int fd, struct vm_mem_range *vmr)
714 {
715 	size_t	rem = vmr->vmr_size, read = 0;
716 	char	buf[PAGE_SIZE];
717 
718 	while (rem > 0) {
719 		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
720 			log_warn("failed to read vmr");
721 			return (-1);
722 		}
723 		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
724 			log_warn("failed to dump vmr");
725 			return (-1);
726 		}
727 		rem = rem - PAGE_SIZE;
728 		read = read + PAGE_SIZE;
729 	}
730 	return (0);
731 }
732 
733 void
734 restore_vmr(int fd, struct vm_mem_range *vmr)
735 {
736 	size_t	rem = vmr->vmr_size, wrote = 0;
737 	char	buf[PAGE_SIZE];
738 
739 	while (rem > 0) {
740 		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
741 			fatal("failed to restore vmr");
742 		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
743 			fatal("failed to write vmr");
744 		rem = rem - PAGE_SIZE;
745 		wrote = wrote + PAGE_SIZE;
746 	}
747 }
748 
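/*
 * pause_vm
 *
 * Pauses the VM: wakes any halted vcpu threads, waits for all vcpu threads
 * to reach the pause barrier and then stops the emulated clocks and
 * devices. Does nothing if the VM is already paused.
 */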
749 void
750 pause_vm(struct vm_create_params *vcp)
751 {
752 	unsigned int n;
753 	int ret;
754 	if (current_vm->vm_state & VM_STATE_PAUSED)
755 		return;
756 
757 	current_vm->vm_state |= VM_STATE_PAUSED;
758 
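	/*
	 * The pause barrier is sized for every vcpu thread plus this thread:
	 * each vcpu run loop waits on it when it sees the VM paused, and
	 * pause_vm() itself waits on it below before stopping the emulated
	 * devices.
	 */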
759 	ret = pthread_barrier_init(&vm_pause_barrier, NULL, vcp->vcp_ncpus + 1);
760 	if (ret) {
761 		log_warnx("%s: cannot initialize pause barrier (%d)",
762 		    __progname, ret);
763 		return;
764 	}
765 
766 	for (n = 0; n < vcp->vcp_ncpus; n++) {
767 		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
768 		if (ret) {
769 			log_warnx("%s: can't broadcast vcpu run cond (%d)",
770 			    __func__, (int)ret);
771 			return;
772 		}
773 	}
774 	ret = pthread_barrier_wait(&vm_pause_barrier);
775 	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
776 		log_warnx("%s: could not wait on pause barrier (%d)",
777 		    __func__, (int)ret);
778 		return;
779 	}
780 
781 	ret = pthread_barrier_destroy(&vm_pause_barrier);
782 	if (ret) {
783 		log_warnx("%s: could not destroy pause barrier (%d)",
784 		    __progname, ret);
785 		return;
786 	}
787 
788 	i8253_stop();
789 	mc146818_stop();
790 	ns8250_stop();
791 	virtio_stop(vcp);
792 }
793 
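/*
 * unpause_vm
 *
 * Unpauses the VM: wakes the vcpu threads blocked on their unpause
 * condition variables and restarts the emulated clocks and devices.
 * Does nothing if the VM is not paused.
 */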
794 void
795 unpause_vm(struct vm_create_params *vcp)
796 {
797 	unsigned int n;
798 	int ret;
799 	if (!(current_vm->vm_state & VM_STATE_PAUSED))
800 		return;
801 
802 	current_vm->vm_state &= ~VM_STATE_PAUSED;
803 	for (n = 0; n < vcp->vcp_ncpus; n++) {
804 		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
805 		if (ret) {
806 			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
807 			    __func__, (int)ret);
808 			return;
809 		}
810 	}
811 
812 	i8253_start();
813 	mc146818_start();
814 	ns8250_start();
815 	virtio_start(vcp);
816 }
817 
818 /*
819  * vcpu_reset
820  *
821  * Requests vmm(4) to reset the VCPUs in the indicated VM to
822  * the register state provided
823  *
824  * Parameters
825  *  vmid: VM ID to reset
826  *  vcpu_id: VCPU ID to reset
827  *  vrs: the register state to initialize
828  *
829  * Return values:
830  *  0: success
831  *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
832  *      valid)
833  */
834 int
835 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
836 {
837 	struct vm_resetcpu_params vrp;
838 
839 	memset(&vrp, 0, sizeof(vrp));
840 	vrp.vrp_vm_id = vmid;
841 	vrp.vrp_vcpu_id = vcpu_id;
842 	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
843 
844 	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
845 
846 	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
847 		return (errno);
848 
849 	return (0);
850 }
851 
852 /*
853  * create_memory_map
854  *
855  * Sets up the guest physical memory ranges that the VM can access.
856  *
857  * Parameters:
858  *  vcp: VM create parameters describing the VM whose memory map
859  *       is being created
860  *
861  * Return values:
862  *  nothing
863  */
864 void
865 create_memory_map(struct vm_create_params *vcp)
866 {
867 	size_t len, mem_bytes, mem_mb;
868 
869 	mem_mb = vcp->vcp_memranges[0].vmr_size;
870 	vcp->vcp_nmemranges = 0;
871 	if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
872 		return;
873 
874 	mem_bytes = mem_mb * 1024 * 1024;
875 
876 	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
877 	len = LOWMEM_KB * 1024;
878 	vcp->vcp_memranges[0].vmr_gpa = 0x0;
879 	vcp->vcp_memranges[0].vmr_size = len;
880 	mem_bytes -= len;
881 
882 	/*
883 	 * Second memory region: LOWMEM_KB - 1MB.
884 	 *
885 	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
886 	 * We have to add this region, because some systems
887 	 * unconditionally write to 0xb8000 (VGA RAM), and
888 	 * we need to make sure that vmm(4) permits accesses
889 	 * to it. So allocate guest memory for it.
890 	 */
891 	len = 0x100000 - LOWMEM_KB * 1024;
892 	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
893 	vcp->vcp_memranges[1].vmr_size = len;
894 	mem_bytes -= len;
895 
896 	/* Make sure that we do not place physical memory into MMIO ranges. */
897 	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
898 		len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
899 	else
900 		len = mem_bytes;
901 
902 	/* Third memory region: 1MB - (1MB + len) */
903 	vcp->vcp_memranges[2].vmr_gpa = 0x100000;
904 	vcp->vcp_memranges[2].vmr_size = len;
905 	mem_bytes -= len;
906 
907 	if (mem_bytes > 0) {
908 		/* Fourth memory region for the remaining memory (if any) */
909 		vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
910 		vcp->vcp_memranges[3].vmr_size = mem_bytes;
911 		vcp->vcp_nmemranges = 4;
912 	} else
913 		vcp->vcp_nmemranges = 3;
914 }
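
/*
 * The resulting guest physical layout, roughly (exact boundaries depend on
 * LOWMEM_KB and the vmm(4) PCI MMIO window):
 *
 *  range 0: [0, LOWMEM_KB * 1024)         DOS low memory
 *  range 1: [LOWMEM_KB * 1024, 1MB)       ROM/VGA hole, still backed by RAM
 *  range 2: [1MB, up to the MMIO window)  main memory
 *  range 3: [above the MMIO window, ...)  remaining memory, if present
 */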
915 
916 /*
917  * alloc_guest_mem
918  *
919  * Allocates memory for the guest.
920  * Instead of doing a single allocation with one mmap(), we allocate memory
921  * separately for every range for the following reasons:
922  * - ASLR for the individual ranges
923  * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
924  *   map the single mmap'd userspace memory to the individual guest physical
925  *   memory ranges, the underlying amap of the single mmap'd range would have
926  *   to allocate per-page reference counters. The reason is that the
927  *   individual guest physical ranges would reference the single mmap'd region
928  *   only partially. However, if every guest physical range has its own
929  *   corresponding mmap'd userspace allocation, there are no partial
930  *   references: every guest physical range fully references an mmap'd
931  *   range => no per-page reference counters have to be allocated.
932  *
933  * Return values:
934  *  0: success
935  *  !0: failure - errno indicating the source of the failure
936  */
937 int
938 alloc_guest_mem(struct vm_create_params *vcp)
939 {
940 	void *p;
941 	int ret;
942 	size_t i, j;
943 	struct vm_mem_range *vmr;
944 
945 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
946 		vmr = &vcp->vcp_memranges[i];
947 		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
948 		    MAP_PRIVATE | MAP_ANON, -1, 0);
949 		if (p == MAP_FAILED) {
950 			ret = errno;
951 			for (j = 0; j < i; j++) {
952 				vmr = &vcp->vcp_memranges[j];
953 				munmap((void *)vmr->vmr_va, vmr->vmr_size);
954 			}
955 
956 			return (ret);
957 		}
958 
959 		vmr->vmr_va = (vaddr_t)p;
960 	}
961 
962 	return (0);
963 }
964 
965 /*
966  * vmm_create_vm
967  *
968  * Requests vmm(4) to create a new VM using the supplied creation
969  * parameters. This operation results in the creation of the in-kernel
970  * structures for the VM, but does not start the VM's vcpu(s).
971  *
972  * Parameters:
973  *  vcp: vm_create_params struct containing the VM's desired creation
974  *      configuration
975  *
976  * Return values:
977  *  0: success
978  *  !0 : ioctl to vmm(4) failed
979  */
980 int
981 vmm_create_vm(struct vm_create_params *vcp)
982 {
983 	/* Sanity check arguments */
984 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
985 		return (EINVAL);
986 
987 	if (vcp->vcp_nmemranges == 0 ||
988 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
989 		return (EINVAL);
990 
991 	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
992 		return (EINVAL);
993 
994 	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
995 		return (EINVAL);
996 
997 	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
998 		return (errno);
999 
1000 	return (0);
1001 }
1002 
1003 /*
1004  * init_emulated_hw
1005  *
1006  * Initializes the userspace hardware emulation
1007  */
1008 void
1009 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
1010     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1011 {
1012 	struct vm_create_params *vcp = &vmc->vmc_params;
1013 	int i;
1014 	uint64_t memlo, memhi;
1015 
1016 	/* Calculate memory size for NVRAM registers */
1017 	memlo = memhi = 0;
1018 	if (vcp->vcp_nmemranges > 2)
1019 		memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000;
1020 
1021 	if (vcp->vcp_nmemranges > 3)
1022 		memhi = vcp->vcp_memranges[3].vmr_size;
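	/*
	 * memlo is the amount of guest memory above 16MB in the low range and
	 * memhi the amount in the high range beyond the PCI MMIO hole; both
	 * are passed to mc146818_init() below to populate the NVRAM memory
	 * size registers.
	 */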
1023 
1024 	/* Reset the IO port map */
1025 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1026 
1027 	/* Init i8253 PIT */
1028 	i8253_init(vcp->vcp_id);
1029 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1030 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1031 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1032 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1033 	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
1034 
1035 	/* Init mc146818 RTC */
1036 	mc146818_init(vcp->vcp_id, memlo, memhi);
1037 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1038 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1039 
1040 	/* Init master and slave PICs */
1041 	i8259_init();
1042 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1043 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1044 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1045 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1046 	ioports_map[ELCR0] = vcpu_exit_elcr;
1047 	ioports_map[ELCR1] = vcpu_exit_elcr;
1048 
1049 	/* Init ns8250 UART */
1050 	ns8250_init(con_fd, vcp->vcp_id);
1051 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1052 		ioports_map[i] = vcpu_exit_com;
1053 
1054 	/* Init QEMU fw_cfg interface */
1055 	fw_cfg_init(vmc);
1056 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1057 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1058 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1059 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1060 
1061 	/* Initialize PCI */
1062 	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
1063 		ioports_map[i] = vcpu_exit_pci;
1064 
1065 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1066 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1067 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1068 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1069 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1070 	pci_init();
1071 
1072 	/* Initialize virtio devices */
1073 	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
1074 }
1075 /*
1076  * restore_emulated_hw
1077  *
1078  * Restores the userspace hardware emulation from fd
1079  */
1080 void
1081 restore_emulated_hw(struct vm_create_params *vcp, int fd,
1082     int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
1083 {
1084 	/* struct vm_create_params *vcp = &vmc->vmc_params; */
1085 	int i;
1086 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1087 
1088 	/* Init i8253 PIT */
1089 	i8253_restore(fd, vcp->vcp_id);
1090 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1091 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1092 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1093 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1094 
1095 	/* Init master and slave PICs */
1096 	i8259_restore(fd);
1097 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1098 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1099 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1100 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1101 
1102 	/* Init ns8250 UART */
1103 	ns8250_restore(fd, con_fd, vcp->vcp_id);
1104 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1105 		ioports_map[i] = vcpu_exit_com;
1106 
1107 	/* Init mc146818 RTC */
1108 	mc146818_restore(fd, vcp->vcp_id);
1109 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1110 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1111 
1112 	/* Init QEMU fw_cfg interface */
1113 	fw_cfg_restore(fd);
1114 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1115 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1116 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1117 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1118 
1119 	/* Initialize PCI */
1120 	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
1121 		ioports_map[i] = vcpu_exit_pci;
1122 
1123 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1124 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1125 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1126 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1127 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1128 	pci_restore(fd);
1129 	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
1130 }
1131 
1132 /*
1133  * run_vm
1134  *
1135  * Runs the VM whose creation parameters are specified in vmc
1136  *
1137  * Parameters:
1138  *  child_cdrom: previously-opened child ISO disk file descriptor
1139  *  child_disks: previously-opened child VM disk file descriptors
1140  *  child_taps: previously-opened child tap file descriptors
1141  *  vmc: vmop_create_params struct containing the VM's desired creation
1142  *      configuration
1143  *  vrs: VCPU register state to initialize
1144  *
1145  * Return values:
1146  *  0: the VM exited normally
1147  *  !0 : the VM exited abnormally or failed to start
1148  */
1149 int
1150 run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
1151     int *child_taps, struct vmop_create_params *vmc,
1152     struct vcpu_reg_state *vrs)
1153 {
1154 	struct vm_create_params *vcp = &vmc->vmc_params;
1155 	struct vm_rwregs_params vregsp;
1156 	uint8_t evdone = 0;
1157 	size_t i;
1158 	int ret;
1159 	pthread_t *tid, evtid;
1160 	struct vm_run_params **vrp;
1161 	void *exit_status;
1162 
1163 	if (vcp == NULL)
1164 		return (EINVAL);
1165 
1166 	if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
1167 		return (EINVAL);
1168 
1169 	if (child_disks == NULL && vcp->vcp_ndisks != 0)
1170 		return (EINVAL);
1171 
1172 	if (child_taps == NULL && vcp->vcp_nnics != 0)
1173 		return (EINVAL);
1174 
1175 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
1176 		return (EINVAL);
1177 
1178 	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
1179 		return (EINVAL);
1180 
1181 	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
1182 		return (EINVAL);
1183 
1184 	if (vcp->vcp_nmemranges == 0 ||
1185 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1186 		return (EINVAL);
1187 
1188 	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
1189 	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
1190 	if (tid == NULL || vrp == NULL) {
1191 		log_warn("%s: memory allocation error - exiting.",
1192 		    __progname);
1193 		return (ENOMEM);
1194 	}
1195 
1196 	log_debug("%s: initializing hardware for vm %s", __func__,
1197 	    vcp->vcp_name);
1198 
1199 	if (!(current_vm->vm_state & VM_STATE_RECEIVED))
1200 		init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);
1201 
1202 	ret = pthread_mutex_init(&threadmutex, NULL);
1203 	if (ret) {
1204 		log_warn("%s: could not initialize thread state mutex",
1205 		    __func__);
1206 		return (ret);
1207 	}
1208 	ret = pthread_cond_init(&threadcond, NULL);
1209 	if (ret) {
1210 		log_warn("%s: could not initialize thread state "
1211 		    "condition variable", __func__);
1212 		return (ret);
1213 	}
1214 
1215 	mutex_lock(&threadmutex);
1216 
1217 	log_debug("%s: starting vcpu threads for vm %s", __func__,
1218 	    vcp->vcp_name);
1219 
1220 	/*
1221 	 * Create and launch one thread for each VCPU. These threads may
1222 	 * migrate between PCPUs over time; the need to reload CPU state
1223 	 * in such situations is detected and performed by vmm(4) in the
1224 	 * kernel.
1225 	 */
1226 	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
1227 		vrp[i] = malloc(sizeof(struct vm_run_params));
1228 		if (vrp[i] == NULL) {
1229 			log_warn("%s: memory allocation error - "
1230 			    "exiting.", __progname);
1231 			/* caller will exit, so skip freeing */
1232 			return (ENOMEM);
1233 		}
1234 		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
1235 		if (vrp[i]->vrp_exit == NULL) {
1236 			log_warn("%s: memory allocation error - "
1237 			    "exiting.", __progname);
1238 			/* caller will exit, so skip freeing */
1239 			return (ENOMEM);
1240 		}
1241 		vrp[i]->vrp_vm_id = vcp->vcp_id;
1242 		vrp[i]->vrp_vcpu_id = i;
1243 
1244 		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
1245 			log_warnx("%s: cannot reset VCPU %zu - exiting.",
1246 			    __progname, i);
1247 			return (EIO);
1248 		}
1249 
1250 		/* once more because reset_cpu changes regs */
1251 		if (current_vm->vm_state & VM_STATE_RECEIVED) {
1252 			vregsp.vrwp_vm_id = vcp->vcp_id;
1253 			vregsp.vrwp_vcpu_id = i;
1254 			vregsp.vrwp_regs = *vrs;
1255 			vregsp.vrwp_mask = VM_RWREGS_ALL;
1256 			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
1257 			    &vregsp)) == -1) {
1258 				log_warn("%s: writeregs failed", __func__);
1259 				return (ret);
1260 			}
1261 		}
1262 
1263 		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
1264 		if (ret) {
1265 			log_warnx("%s: cannot initialize cond var (%d)",
1266 			    __progname, ret);
1267 			return (ret);
1268 		}
1269 
1270 		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
1271 		if (ret) {
1272 			log_warnx("%s: cannot initialize mtx (%d)",
1273 			    __progname, ret);
1274 			return (ret);
1275 		}
1276 
1277 		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
1278 		if (ret) {
1279 			log_warnx("%s: cannot initialize unpause var (%d)",
1280 			    __progname, ret);
1281 			return (ret);
1282 		}
1283 
1284 		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
1285 		if (ret) {
1286 			log_warnx("%s: cannot initialize unpause mtx (%d)",
1287 			    __progname, ret);
1288 			return (ret);
1289 		}
1290 
1291 		vcpu_hlt[i] = 0;
1292 
1293 		/* Start each VCPU run thread at vcpu_run_loop */
1294 		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
1295 		if (ret) {
1296 			/* caller will _exit after this return */
1297 			ret = errno;
1298 			log_warn("%s: could not create vcpu thread %zu",
1299 			    __func__, i);
1300 			return (ret);
1301 		}
1302 	}
1303 
1304 	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
1305 	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
1306 	if (ret) {
1307 		errno = ret;
1308 		log_warn("%s: could not create event thread", __func__);
1309 		return (ret);
1310 	}
1311 
1312 	for (;;) {
1313 		ret = pthread_cond_wait(&threadcond, &threadmutex);
1314 		if (ret) {
1315 			log_warn("%s: waiting on thread state condition "
1316 			    "variable failed", __func__);
1317 			return (ret);
1318 		}
1319 
1320 		/*
1321 		 * Did a VCPU thread exit with an error? => return the first one
1322 		 */
1323 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1324 			if (vcpu_done[i] == 0)
1325 				continue;
1326 
1327 			if (pthread_join(tid[i], &exit_status)) {
1328 				log_warn("%s: failed to join thread %zd - "
1329 				    "exiting", __progname, i);
1330 				return (EIO);
1331 			}
1332 
1333 			ret = (intptr_t)exit_status;
1334 		}
1335 
1336 		/* Did the event thread exit? => return with an error */
1337 		if (evdone) {
1338 			if (pthread_join(evtid, &exit_status)) {
1339 				log_warn("%s: failed to join event thread - "
1340 				    "exiting", __progname);
1341 				return (EIO);
1342 			}
1343 
1344 			log_warnx("%s: vm %d event thread exited "
1345 			    "unexpectedly", __progname, vcp->vcp_id);
1346 			return (EIO);
1347 		}
1348 
1349 		/* Did all VCPU threads exit successfully? => return */
1350 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1351 			if (vcpu_done[i] == 0)
1352 				break;
1353 		}
1354 		if (i == vcp->vcp_ncpus)
1355 			return (ret);
1356 
1357 		/* Some more threads to wait for, start over */
1358 	}
1359 
1360 	return (ret);
1361 }
1362 
1363 void *
1364 event_thread(void *arg)
1365 {
1366 	uint8_t *donep = arg;
1367 	intptr_t ret;
1368 
1369 	ret = event_dispatch();
1370 
1371 	mutex_lock(&threadmutex);
1372 	*donep = 1;
1373 	pthread_cond_signal(&threadcond);
1374 	mutex_unlock(&threadmutex);
1375 
1376 	return (void *)ret;
1377 }
1378 
1379 /*
1380  * vcpu_run_loop
1381  *
1382  * Runs a single VCPU until vmm(4) requires help handling an exit,
1383  * or the VM terminates.
1384  *
1385  * Parameters:
1386  *  arg: vcpu_run_params for the VCPU being run by this thread
1387  *
1388  * Return values:
1389  *  NULL: the VCPU shutdown properly
1390  *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
1391  */
1392 void *
1393 vcpu_run_loop(void *arg)
1394 {
1395 	struct vm_run_params *vrp = (struct vm_run_params *)arg;
1396 	intptr_t ret = 0;
1397 	int irq;
1398 	uint32_t n;
1399 
1400 	vrp->vrp_continue = 0;
1401 	n = vrp->vrp_vcpu_id;
1402 
1403 	for (;;) {
1404 		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
1405 
1406 		if (ret) {
1407 			log_warnx("%s: can't lock vcpu run mtx (%d)",
1408 			    __func__, (int)ret);
1409 			return ((void *)ret);
1410 		}
1411 
1412 		/* If we are halted and need to pause, pause */
1413 		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
1414 			ret = pthread_barrier_wait(&vm_pause_barrier);
1415 			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
1416 				log_warnx("%s: could not wait on pause barrier (%d)",
1417 				    __func__, (int)ret);
1418 				return ((void *)ret);
1419 			}
1420 
1421 			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
1422 			if (ret) {
1423 				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
1424 				    __func__, (int)ret);
1425 				return ((void *)ret);
1426 			}
1427 
1428 			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
1429 			    &vcpu_unpause_mtx[n]);
1430 			if (ret) {
1431 				log_warnx(
1432 				    "%s: can't wait on unpause cond (%d)",
1433 				    __func__, (int)ret);
1434 				break;
1435 			}
1436 			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
1437 			if (ret) {
1438 				log_warnx("%s: can't unlock unpause mtx (%d)",
1439 				    __func__, (int)ret);
1440 				break;
1441 			}
1442 		}
1443 
1444 		/* If we are halted and not paused, wait */
1445 		if (vcpu_hlt[n]) {
1446 			ret = pthread_cond_wait(&vcpu_run_cond[n],
1447 			    &vcpu_run_mtx[n]);
1448 
1449 			if (ret) {
1450 				log_warnx(
1451 				    "%s: can't wait on cond (%d)",
1452 				    __func__, (int)ret);
1453 				(void)pthread_mutex_unlock(
1454 				    &vcpu_run_mtx[n]);
1455 				break;
1456 			}
1457 		}
1458 
1459 		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
1460 
1461 		if (ret) {
1462 			log_warnx("%s: can't unlock mutex on cond (%d)",
1463 			    __func__, (int)ret);
1464 			break;
1465 		}
1466 
1467 		if (vrp->vrp_irqready && i8259_is_pending()) {
1468 			irq = i8259_ack();
1469 			vrp->vrp_irq = irq;
1470 		} else
1471 			vrp->vrp_irq = 0xFFFF;
1472 
1473 		/* Still more pending? */
1474 		if (i8259_is_pending()) {
1475 			/*
1476 			 * XXX can probably avoid ioctls here by providing intr
1477 			 * in vrp
1478 			 */
1479 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1480 			    vrp->vrp_vcpu_id, 1)) {
1481 				fatal("can't set INTR");
1482 			}
1483 		} else {
1484 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1485 			    vrp->vrp_vcpu_id, 0)) {
1486 				fatal("can't clear INTR");
1487 			}
1488 		}
1489 
1490 		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
1491 			/* If run ioctl failed, exit */
1492 			ret = errno;
1493 			log_warn("%s: vm %d / vcpu %d run ioctl failed",
1494 			    __func__, vrp->vrp_vm_id, n);
1495 			break;
1496 		}
1497 
1498 		/* If the VM is terminating, exit normally */
1499 		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
1500 			ret = (intptr_t)NULL;
1501 			break;
1502 		}
1503 
1504 		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
1505 			/*
1506 			 * vmm(4) needs help handling an exit, handle in
1507 			 * vcpu_exit.
1508 			 */
1509 			ret = vcpu_exit(vrp);
1510 			if (ret)
1511 				break;
1512 		}
1513 	}
1514 
1515 	mutex_lock(&threadmutex);
1516 	vcpu_done[n] = 1;
1517 	pthread_cond_signal(&threadcond);
1518 	mutex_unlock(&threadmutex);
1519 
1520 	return ((void *)ret);
1521 }
1522 
1523 int
1524 vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
1525 {
1526 	struct vm_intr_params vip;
1527 
1528 	memset(&vip, 0, sizeof(vip));
1529 
1530 	vip.vip_vm_id = vm_id;
1531 	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
1532 	vip.vip_intr = intr;
1533 
1534 	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
1535 		return (errno);
1536 
1537 	return (0);
1538 }
1539 
1540 /*
1541  * vcpu_exit_pci
1542  *
1543  * Handle all I/O to the emulated PCI subsystem.
1544  *
1545  * Parameters:
1546  *  vrp: vcpu run parameters containing guest state for this exit
1547  *
1548  * Return value:
1549  *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
1550  *      be injected.
1551  */
1552 uint8_t
1553 vcpu_exit_pci(struct vm_run_params *vrp)
1554 {
1555 	struct vm_exit *vei = vrp->vrp_exit;
1556 	uint8_t intr;
1557 
1558 	intr = 0xFF;
1559 
1560 	switch (vei->vei.vei_port) {
1561 	case PCI_MODE1_ADDRESS_REG:
1562 		pci_handle_address_reg(vrp);
1563 		break;
1564 	case PCI_MODE1_DATA_REG:
1565 	case PCI_MODE1_DATA_REG + 1:
1566 	case PCI_MODE1_DATA_REG + 2:
1567 	case PCI_MODE1_DATA_REG + 3:
1568 		pci_handle_data_reg(vrp);
1569 		break;
1570 	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
1571 		intr = pci_handle_io(vrp);
1572 		break;
1573 	default:
1574 		log_warnx("%s: unknown PCI register 0x%llx",
1575 		    __progname, (uint64_t)vei->vei.vei_port);
1576 		break;
1577 	}
1578 
1579 	return (intr);
1580 }
1581 
1582 /*
1583  * vcpu_exit_inout
1584  *
1585  * Handle all I/O exits that need to be emulated in vmd. This includes the
1586  * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
1587  *
1588  * Parameters:
1589  *  vrp: vcpu run parameters containing guest state for this exit
1590  */
1591 void
1592 vcpu_exit_inout(struct vm_run_params *vrp)
1593 {
1594 	struct vm_exit *vei = vrp->vrp_exit;
1595 	uint8_t intr = 0xFF;
1596 
1597 	if (ioports_map[vei->vei.vei_port] != NULL)
1598 		intr = ioports_map[vei->vei.vei_port](vrp);
1599 	else if (vei->vei.vei_dir == VEI_DIR_IN)
1600 		set_return_data(vei, 0xFFFFFFFF);
1601 
1602 	if (intr != 0xFF)
1603 		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
1604 }
1605 
1606 /*
1607  * vcpu_exit_eptviolation
1608  *
1609  * handle an EPT Violation
1610  *
1611  * Parameters:
1612  *  vrp: vcpu run parameters containing guest state for this exit
1613  *
1614  * Return values:
1615  *  0: no action required
1616  *  EAGAIN: a protection fault occurred, kill the vm.
1617  */
1618 int
1619 vcpu_exit_eptviolation(struct vm_run_params *vrp)
1620 {
1621 	struct vm_exit *ve = vrp->vrp_exit;
1622 
1623 	/*
1624 	 * The vcpu may be exiting to vmd to handle a pending interrupt,
1625 	 * but the last exit type may have been VMX_EXIT_EPT_VIOLATION.
1626 	 * Check the fault_type to ensure we really are processing
1627 	 * a VMX_EXIT_EPT_VIOLATION.
1628 	 */
1629 	if (ve->vee.vee_fault_type == VEE_FAULT_PROTECT) {
1630 		log_debug("%s: EPT Violation: rip=0x%llx",
1631 		    __progname, vrp->vrp_exit->vrs.vrs_gprs[VCPU_REGS_RIP]);
1632 		return (EAGAIN);
1633 	}
1634 
1635 	return (0);
1636 }
1637 
1638 /*
1639  * vcpu_exit
1640  *
1641  * Handle a vcpu exit. This function is called when it is determined that
1642  * vmm(4) requires the assistance of vmd to support a particular guest
1643  * exit type (eg, accessing an I/O port or device). Guest state is contained
1644  * in 'vrp', and will be resent to vmm(4) on exit completion.
1645  *
1646  * Upon conclusion of handling the exit, the function determines if any
1647  * interrupts should be injected into the guest, and asserts the proper
1648  * IRQ line whose interrupt should be vectored.
1649  *
1650  * Parameters:
1651  *  vrp: vcpu run parameters containing guest state for this exit
1652  *
1653  * Return values:
1654  *  0: the exit was handled successfully
1655  *  !0 : an error occurred while handling the exit
1656  */
1657 int
1658 vcpu_exit(struct vm_run_params *vrp)
1659 {
1660 	int ret;
1661 
1662 	switch (vrp->vrp_exit_reason) {
1663 	case VMX_EXIT_INT_WINDOW:
1664 	case SVM_VMEXIT_VINTR:
1665 	case VMX_EXIT_CPUID:
1666 	case VMX_EXIT_EXTINT:
1667 	case SVM_VMEXIT_INTR:
1668 	case SVM_VMEXIT_NPF:
1669 	case SVM_VMEXIT_MSR:
1670 	case SVM_VMEXIT_CPUID:
1671 		/*
1672 		 * We may be exiting to vmd to handle a pending interrupt but
1673 		 * at the same time the last exit type may have been one of
1674 		 * these. In this case, there's nothing extra to be done
1675 		 * here (and falling through to the default case below results
1676 		 * in more vmd log spam).
1677 		 */
1678 		break;
1679 	case VMX_EXIT_EPT_VIOLATION:
1680 		ret = vcpu_exit_eptviolation(vrp);
1681 		if (ret)
1682 			return (ret);
1683 
1684 		break;
1685 	case VMX_EXIT_IO:
1686 	case SVM_VMEXIT_IOIO:
1687 		vcpu_exit_inout(vrp);
1688 		break;
1689 	case VMX_EXIT_HLT:
1690 	case SVM_VMEXIT_HLT:
1691 		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1692 		if (ret) {
1693 			log_warnx("%s: can't lock vcpu mutex (%d)",
1694 			    __func__, ret);
1695 			return (ret);
1696 		}
1697 		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
1698 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1699 		if (ret) {
1700 			log_warnx("%s: can't unlock vcpu mutex (%d)",
1701 			    __func__, ret);
1702 			return (ret);
1703 		}
1704 		break;
1705 	case VMX_EXIT_TRIPLE_FAULT:
1706 	case SVM_VMEXIT_SHUTDOWN:
1707 		/* reset VM */
1708 		return (EAGAIN);
1709 	default:
1710 		log_debug("%s: unknown exit reason 0x%x",
1711 		    __progname, vrp->vrp_exit_reason);
1712 	}
1713 
1714 	vrp->vrp_continue = 1;
1715 
1716 	return (0);
1717 }
1718 
1719 /*
1720  * find_gpa_range
1721  *
1722  * Search for a contiguous guest physical mem range.
1723  *
1724  * Parameters:
1725  *  vcp: VM create parameters that contain the memory map to search in
1726  *  gpa: the starting guest physical address
1727  *  len: the length of the memory range
1728  *
1729  * Return values:
1730  *  NULL: on failure if there is no memory range as described by the parameters
1731  *  Pointer to vm_mem_range that contains the start of the range otherwise.
1732  */
1733 static struct vm_mem_range *
1734 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
1735 {
1736 	size_t i, n;
1737 	struct vm_mem_range *vmr;
1738 
1739 	/* Find the first vm_mem_range that contains gpa */
1740 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1741 		vmr = &vcp->vcp_memranges[i];
1742 		if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
1743 			break;
1744 	}
1745 
1746 	/* No range found. */
1747 	if (i == vcp->vcp_nmemranges)
1748 		return (NULL);
1749 
1750 	/*
1751 	 * vmr may cover the range [gpa, gpa + len) only partly. Make
1752 	 * sure that the following vm_mem_ranges are contiguous and
1753 	 * cover the rest.
1754 	 */
1755 	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
1756 	if (len < n)
1757 		len = 0;
1758 	else
1759 		len -= n;
1760 	gpa = vmr->vmr_gpa + vmr->vmr_size;
1761 	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
1762 		vmr = &vcp->vcp_memranges[i];
1763 		if (gpa != vmr->vmr_gpa)
1764 			return (NULL);
1765 		if (len <= vmr->vmr_size)
1766 			len = 0;
1767 		else
1768 			len -= vmr->vmr_size;
1769 
1770 		gpa = vmr->vmr_gpa + vmr->vmr_size;
1771 	}
1772 
1773 	if (len != 0)
1774 		return (NULL);
1775 
1776 	return (vmr);
1777 }
1778 
1779 /*
1780  * write_mem
1781  *
1782  * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
1783  *
1784  * Parameters:
1785  *  dst: the destination paddr_t in the guest VM
1786  *  buf: data to copy (or NULL to zero the data)
1787  *  len: number of bytes to copy
1788  *
1789  * Return values:
1790  *  0: success
1791  *  EINVAL: if the guest physical memory range [dst, dst + len) does not
1792  *      exist in the guest.
1793  */
1794 int
1795 write_mem(paddr_t dst, const void *buf, size_t len)
1796 {
1797 	const char *from = buf;
1798 	char *to;
1799 	size_t n, off;
1800 	struct vm_mem_range *vmr;
1801 
1802 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
1803 	if (vmr == NULL) {
1804 		errno = EINVAL;
1805 		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
1806 		    "len = 0x%zx", __func__, dst, len);
1807 		return (EINVAL);
1808 	}
1809 
1810 	off = dst - vmr->vmr_gpa;
1811 	while (len != 0) {
1812 		n = vmr->vmr_size - off;
1813 		if (len < n)
1814 			n = len;
1815 
1816 		to = (char *)vmr->vmr_va + off;
1817 		if (buf == NULL)
1818 			memset(to, 0, n);
1819 		else {
1820 			memcpy(to, from, n);
1821 			from += n;
1822 		}
1823 		len -= n;
1824 		off = 0;
1825 		vmr++;
1826 	}
1827 
1828 	return (0);
1829 }
1830 
1831 /*
1832  * read_mem
1833  *
1834  * Reads memory at guest paddr 'src' into 'buf'.
1835  *
1836  * Parameters:
1837  *  src: the source paddr_t in the guest VM to read from.
1838  *  buf: destination (local) buffer
1839  *  len: number of bytes to read
1840  *
1841  * Return values:
1842  *  0: success
1843  *  EINVAL: if the guest physical memory range [src, src + len) does not
1844  *      exist in the guest.
1845  */
1846 int
1847 read_mem(paddr_t src, void *buf, size_t len)
1848 {
1849 	char *from, *to = buf;
1850 	size_t n, off;
1851 	struct vm_mem_range *vmr;
1852 
1853 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
1854 	if (vmr == NULL) {
1855 		errno = EINVAL;
1856 		log_warn("%s: failed - invalid memory range src = 0x%lx, "
1857 		    "len = 0x%zx", __func__, src, len);
1858 		return (EINVAL);
1859 	}
1860 
1861 	off = src - vmr->vmr_gpa;
1862 	while (len != 0) {
1863 		n = vmr->vmr_size - off;
1864 		if (len < n)
1865 			n = len;
1866 
1867 		from = (char *)vmr->vmr_va + off;
1868 		memcpy(to, from, n);
1869 
1870 		to += n;
1871 		len -= n;
1872 		off = 0;
1873 		vmr++;
1874 	}
1875 
1876 	return (0);
1877 }
1878 
1879 /*
1880  * vcpu_assert_pic_irq
1881  *
1882  * Injects the specified IRQ on the supplied vcpu/vm
1883  *
1884  * Parameters:
1885  *  vm_id: VM ID to inject to
1886  *  vcpu_id: VCPU ID to inject to
1887  *  irq: IRQ to inject
1888  */
1889 void
1890 vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1891 {
1892 	int ret;
1893 
1894 	i8259_assert_irq(irq);
1895 
1896 	if (i8259_is_pending()) {
1897 		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
1898 			fatalx("%s: can't assert INTR", __func__);
1899 
1900 		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
1901 		if (ret)
1902 			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);
1903 
1904 		vcpu_hlt[vcpu_id] = 0;
1905 		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
1906 		if (ret)
1907 			fatalx("%s: can't signal (%d)", __func__, ret);
1908 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
1909 		if (ret)
1910 			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
1911 	}
1912 }
1913 
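/*
 * Illustrative sketch, not part of the original file: a device model
 * raises its interrupt roughly like this once it has data ready.  The
 * IRQ number and the use of vcpu 0 are hypothetical placeholders.
 */
static void
example_device_raise_irq(void)
{
	uint32_t vm_id = current_vm->vm_params.vmc_params.vcp_id;

	/* Assert IRQ 9; the PIC state decides whether INTR is raised. */
	vcpu_assert_pic_irq(vm_id, 0, 9);
}
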
1914 /*
1915  * vcpu_deassert_pic_irq
1916  *
1917  * Clears the specified IRQ on the supplied vcpu/vm
1918  *
1919  * Parameters:
1920  *  vm_id: VM ID to clear in
1921  *  vcpu_id: VCPU ID to clear in
1922  *  irq: IRQ to clear
1923  */
1924 void
1925 vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1926 {
1927 	i8259_deassert_irq(irq);
1928 
1929 	if (!i8259_is_pending()) {
1930 		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
1931 			fatalx("%s: can't deassert INTR for vm_id %d, "
1932 			    "vcpu_id %d", __func__, vm_id, vcpu_id);
1933 	}
1934 }
1935 
1936 /*
1937  * fd_hasdata
1938  *
1939  * Determines if data can be read from a file descriptor.
1940  *
1941  * Parameters:
1942  *  fd: the fd to check
1943  *
1944  * Return values:
1945  *  1 if data can be read from an fd, or 0 otherwise.
1946  */
1947 int
1948 fd_hasdata(int fd)
1949 {
1950 	struct pollfd pfd[1];
1951 	int nready, hasdata = 0;
1952 
1953 	pfd[0].fd = fd;
1954 	pfd[0].events = POLLIN;
1955 	nready = poll(pfd, 1, 0);
1956 	if (nready == -1)
1957 		log_warn("checking file descriptor for data failed");
1958 	else if (nready == 1 && pfd[0].revents & POLLIN)
1959 		hasdata = 1;
1960 	return (hasdata);
1961 }
1962 
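/*
 * Illustrative sketch, not part of the original file: fd_hasdata() lets a
 * device model check for pending input before issuing a read that might
 * otherwise block.  The fd and buffer below are hypothetical.
 */
static void
example_drain_fd(int fd)
{
	char buf[64];

	/* Keep reading while poll(2) reports pending input. */
	while (fd_hasdata(fd)) {
		if (read(fd, buf, sizeof(buf)) <= 0)
			break;
	}
}
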
1963 /*
1964  * mutex_lock
1965  *
1966  * Wrapper function for pthread_mutex_lock that does error checking and
1967  * exits on failure.
1968  */
1969 void
1970 mutex_lock(pthread_mutex_t *m)
1971 {
1972 	int ret;
1973 
1974 	ret = pthread_mutex_lock(m);
1975 	if (ret) {
1976 		errno = ret;
1977 		fatal("could not acquire mutex");
1978 	}
1979 }
1980 
1981 /*
1982  * mutex_unlock
1983  *
1984  * Wrapper function for pthread_mutex_unlock that does error checking and
1985  * exits on failure.
1986  */
1987 void
1988 mutex_unlock(pthread_mutex_t *m)
1989 {
1990 	int ret;
1991 
1992 	ret = pthread_mutex_unlock(m);
1993 	if (ret) {
1994 		errno = ret;
1995 		fatal("could not release mutex");
1996 	}
1997 }
1998 
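/*
 * Illustrative sketch, not part of the original file: the wrappers above
 * are meant to bracket short critical sections without per-call error
 * handling, e.g. when touching per-vcpu state shared with the run loop.
 * The mutex and flag here are hypothetical stand-ins.
 */
static void
example_set_flag_locked(pthread_mutex_t *mtx, int *flag)
{
	mutex_lock(mtx);	/* exits via fatal() if locking fails */
	*flag = 1;
	mutex_unlock(mtx);	/* exits via fatal() if unlocking fails */
}
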
1999 /*
2000  * set_return_data
2001  *
2002  * Utility function for manipulating register data in vm exit info structs. This
2003  * function ensures that the data is copied to the vei->vei.vei_data field with
2004  * the proper size for the operation being performed.
2005  *
2006  * Parameters:
2007  *  vei: exit information
2008  *  data: return data
2009  */
2010 void
2011 set_return_data(struct vm_exit *vei, uint32_t data)
2012 {
2013 	switch (vei->vei.vei_size) {
2014 	case 1:
2015 		vei->vei.vei_data &= ~0xFF;
2016 		vei->vei.vei_data |= (uint8_t)data;
2017 		break;
2018 	case 2:
2019 		vei->vei.vei_data &= ~0xFFFF;
2020 		vei->vei.vei_data |= (uint16_t)data;
2021 		break;
2022 	case 4:
2023 		vei->vei.vei_data = data;
2024 		break;
2025 	}
2026 }
2027 
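/*
 * Illustrative sketch, not part of the original file: an I/O port handler
 * servicing an "in" access uses set_return_data() so that only the bytes
 * the guest requested are modified in vei_data.  The status value
 * returned here is a hypothetical placeholder.
 */
static void
example_in_handler(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;

	if (vei->vei.vei_dir == VEI_DIR_IN)
		set_return_data(vei, 0x01);	/* sized per vei_size */
}
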
2028 /*
2029  * get_input_data
2030  *
2031  * Utility function for manipulating register data in vm exit info
2032  * structs. This function ensures that the data is copied from the
2033  * vei->vei.vei_data field with the proper size for the operation being
2034  * performed.
2035  *
2036  * Parameters:
2037  *  vei: exit information
2038  *  data: location to store the result
2039  */
2040 void
2041 get_input_data(struct vm_exit *vei, uint32_t *data)
2042 {
2043 	switch (vei->vei.vei_size) {
2044 	case 1:
2045 		*data &= 0xFFFFFF00;
2046 		*data |= (uint8_t)vei->vei.vei_data;
2047 		break;
2048 	case 2:
2049 		*data &= 0xFFFF0000;
2050 		*data |= (uint16_t)vei->vei.vei_data;
2051 		break;
2052 	case 4:
2053 		*data = vei->vei.vei_data;
2054 		break;
2055 	default:
2056 		log_warnx("%s: invalid i/o size %d", __func__,
2057 		    vei->vei.vei_size);
2058 	}
2059 
2060 }
2061 
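/*
 * Illustrative sketch, not part of the original file: the complementary
 * "out" path, where a handler fetches the value the guest wrote, already
 * truncated to the access size by get_input_data().  The latched register
 * is a hypothetical device register.
 */
static void
example_out_handler(struct vm_run_params *vrp, uint32_t *reg)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint32_t data = 0;

	if (vei->vei.vei_dir == VEI_DIR_OUT) {
		get_input_data(vei, &data);
		*reg = data;	/* latch the guest-written value */
	}
}
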
2062 /*
2063  * translate_gva
2064  *
2065  * Translates a guest virtual address to a guest physical address by walking
2066  * the currently active page table (if needed).
2067  *
2068  * Note - this function can possibly alter the supplied VCPU state.
2069  *  Specifically, it may inject exceptions depending on the current VCPU
2070  *  configuration, and may alter %cr2 on #PF. Consequently, this function
2071  *  should only be used as part of instruction emulation.
2072  *
2073  * Parameters:
2074  *  exit: The VCPU this translation should be performed for (guest MMU settings
2075  *   are gathered from this VCPU)
2076  *  va: virtual address to translate
2077  *  pa: pointer to paddr_t variable that will receive the translated physical
2078  *   address. 'pa' is unchanged on error.
2079  *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
2080  *   the address should be translated
2081  *
2082  * Return values:
2083  *  0: the address was successfully translated - 'pa' contains the physical
2084  *     address currently mapped by 'va'.
2085  *  EFAULT: the PTE for 'va' is unmapped or could not be read. A #PF will
2086  *     be injected in this case and %cr2 set in the vcpu structure.
2087  *  EINVAL: an error occurred reading paging table structures
2088  */
2089 int
2090 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
2091 {
2092 	int level, shift, pdidx;
2093 	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
2094 	uint64_t shift_width, pte_size;
2095 	struct vcpu_reg_state *vrs;
2096 
2097 	vrs = &exit->vrs;
2098 
2099 	if (!pa)
2100 		return (EINVAL);
2101 
2102 	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
2103 		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
2104 		*pa = va;
2105 		return (0);
2106 	}
2107 
2108 	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
2109 
2110 	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
2111 	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
2112 
2113 	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
2114 		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
2115 			pte_size = sizeof(uint64_t);
2116 			shift_width = 9;
2117 
2118 			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
2119 				/* 4 level paging */
2120 				level = 4;
2121 				mask = L4_MASK;
2122 				shift = L4_SHIFT;
2123 			} else {
2124 				/* 32 bit with PAE paging */
2125 				level = 3;
2126 				mask = L3_MASK;
2127 				shift = L3_SHIFT;
2128 			}
2129 		} else {
2130 			/* 32 bit paging */
2131 			level = 2;
2132 			shift_width = 10;
2133 			mask = 0xFFC00000;
2134 			shift = 22;
2135 			pte_size = sizeof(uint32_t);
2136 		}
2137 	} else
2138 		return (EINVAL);
2139 
2140 	/* XXX: Check for R bit in segment selector and set A bit */
2141 
2142 	for (; level > 0; level--) {
2143 		pdidx = (va & mask) >> shift;
2144 		pte_paddr = pt_paddr + (pdidx * pte_size);
2145 
2146 		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
2147 		    level, pte_paddr);
2148 		if (read_mem(pte_paddr, &pte, pte_size)) {
2149 			log_warn("%s: failed to read pte", __func__);
2150 			return (EFAULT);
2151 		}
2152 
2153 		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
2154 		    pte);
2155 
2156 		/* XXX: Set CR2  */
2157 		if (!(pte & PG_V))
2158 			return (EFAULT);
2159 
2160 		/* XXX: Check for SMAP */
2161 		if ((mode == PROT_WRITE) && !(pte & PG_RW))
2162 			return (EPERM);
2163 
2164 		if ((exit->cpl > 0) && !(pte & PG_u))
2165 			return (EPERM);
2166 
2167 		pte = pte | PG_U;
2168 		if (mode == PROT_WRITE)
2169 			pte = pte | PG_M;
2170 		if (write_mem(pte_paddr, &pte, pte_size)) {
2171 			log_warn("%s: failed to write back flags to pte",
2172 			    __func__);
2173 			return (EIO);
2174 		}
2175 
2176 		/* XXX: EINVAL if in 32-bit mode and PG_PS is 1 but CR4.PSE is 0 */
2177 		if (pte & PG_PS)
2178 			break;
2179 
2180 		if (level > 1) {
2181 			pt_paddr = pte & PG_FRAME;
2182 			shift -= shift_width;
2183 			mask = mask >> shift_width;
2184 		}
2185 	}
2186 
2187 	low_mask = (1 << shift) - 1;
2188 	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
2189 	*pa = (pte & high_mask) | (va & low_mask);
2190 
2191 	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx", __func__, va, *pa);
2192 
2193 	return (0);
2194 }
2195 
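/*
 * Illustrative sketch, not part of the original file: during instruction
 * emulation a caller could translate the guest's current %rip and then
 * fetch instruction bytes through read_mem().  The helper name and fetch
 * length are hypothetical.
 */
static int
example_fetch_insn(struct vm_exit *exit, uint8_t *insn, size_t len)
{
	uint64_t gpa;

	if (translate_gva(exit, exit->vrs.vrs_gprs[VCPU_REGS_RIP], &gpa,
	    PROT_EXEC))
		return (-1);

	/* A real fetcher would re-translate if the read crosses a page. */
	return (read_mem(gpa, insn, len));
}
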
2196 /*
2197  * vm_pipe_init
2198  *
2199  * Initialize a vm_dev_pipe, setting up its file descriptors and its
2200  * event structure with the given callback.
2201  *
2202  * Parameters:
2203  *  p: pointer to vm_dev_pipe struct to initialize
2204  *  cb: callback to use for READ events on the read end of the pipe
2205  */
2206 void
2207 vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
2208 {
2209 	int ret;
2210 	int fds[2];
2211 
2212 	memset(p, 0, sizeof(struct vm_dev_pipe));
2213 
2214 	ret = pipe(fds);
2215 	if (ret)
2216 		fatal("failed to create vm_dev_pipe pipe");
2217 
2218 	p->read = fds[0];
2219 	p->write = fds[1];
2220 
2221 	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
2222 }
2223 
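/*
 * Illustrative sketch, not part of the original file: a device typically
 * initializes its pipe during setup and then adds the read-end event to
 * the event loop so the callback fires when a message arrives.  The
 * callback pointer is supplied by the (hypothetical) caller.
 */
static void
example_dev_pipe_setup(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
{
	vm_pipe_init(p, cb);
	event_add(&p->read_ev, NULL);	/* start watching the read end */
}
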
2224 /*
2225  * vm_pipe_send
2226  *
2227  * Send a message to an emulated device via the provided vm_dev_pipe.
2228  *
2229  * Parameters:
2230  *  p: pointer to initialized vm_dev_pipe
2231  *  msg: message to send in the channel
2232  */
2233 void
2234 vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
2235 {
2236 	ssize_t n;
2237 	n = write(p->write, &msg, sizeof(msg));
2238 	if (n != sizeof(msg))
2239 		fatal("failed to write to device pipe");
2240 }
2241 
2242 /*
2243  * vm_pipe_recv
2244  *
2245  * Receive a message for an emulated device via the provided vm_dev_pipe.
2246  * Returns the message value; exits via fatal() if the read fails.
2247  *
2248  * Parameters:
2249  *  p: pointer to initialized vm_dev_pipe
2250  *
2251  * Return values:
2252  *  a value of enum pipe_msg_type or fatal exit on read(2) error
2253  */
2254 enum pipe_msg_type
2255 vm_pipe_recv(struct vm_dev_pipe *p)
2256 {
2257 	ssize_t n;
2258 	enum pipe_msg_type msg;
2259 	n = read(p->read, &msg, sizeof(msg));
2260 	if (n != sizeof(msg))
2261 		fatal("failed to read from device pipe");
2262 
2263 	return msg;
2264 }
2265
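
/*
 * Illustrative sketch, not part of the original file: a typical pairing is
 * a worker thread posting a message with vm_pipe_send() and the libevent
 * callback registered via vm_pipe_init() consuming it with vm_pipe_recv().
 * Since vm_pipe_init() registers the callback with a NULL argument, a
 * device usually reaches its pipe through its own state; the global and
 * the log-only handling below are hypothetical.
 */
static struct vm_dev_pipe example_pipe;

static void
example_pipe_callback(int fd, short event, void *arg)
{
	enum pipe_msg_type msg;

	msg = vm_pipe_recv(&example_pipe);	/* fatal() on short read */
	log_debug("%s: received message %d", __func__, (int)msg);
}

static void
example_pipe_post(enum pipe_msg_type msg)
{
	/* Wakes the event thread, which runs the registered callback. */
	vm_pipe_send(&example_pipe, msg);
}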