1 /*	$OpenBSD: vm.c,v 1.54 2019/12/11 06:45:16 pd Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/ioctl.h>
21 #include <sys/queue.h>
22 #include <sys/wait.h>
23 #include <sys/uio.h>
24 #include <sys/socket.h>
25 #include <sys/time.h>
26 #include <sys/mman.h>
27 
28 #include <dev/ic/i8253reg.h>
29 #include <dev/isa/isareg.h>
30 #include <dev/pci/pcireg.h>
31 
32 #include <machine/param.h>
33 #include <machine/psl.h>
34 #include <machine/pte.h>
35 #include <machine/specialreg.h>
36 #include <machine/vmmvar.h>
37 
38 #include <net/if.h>
39 
40 #include <errno.h>
41 #include <event.h>
42 #include <fcntl.h>
43 #include <imsg.h>
44 #include <limits.h>
45 #include <poll.h>
46 #include <pthread.h>
47 #include <stddef.h>
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include <unistd.h>
52 #include <util.h>
53 
54 #include "vmd.h"
55 #include "vmm.h"
56 #include "loadfile.h"
57 #include "pci.h"
58 #include "virtio.h"
59 #include "proc.h"
60 #include "i8253.h"
61 #include "i8259.h"
62 #include "ns8250.h"
63 #include "mc146818.h"
64 #include "fw_cfg.h"
65 #include "atomicio.h"
66 
67 io_fn_t ioports_map[MAX_PORTS];
68 
69 int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
70     struct vmop_create_params *, struct vcpu_reg_state *);
71 void vm_dispatch_vmm(int, short, void *);
72 void *event_thread(void *);
73 void *vcpu_run_loop(void *);
74 int vcpu_exit(struct vm_run_params *);
75 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
76 void create_memory_map(struct vm_create_params *);
77 int alloc_guest_mem(struct vm_create_params *);
78 int vmm_create_vm(struct vm_create_params *);
79 void init_emulated_hw(struct vmop_create_params *, int,
80     int[][VM_MAX_BASE_PER_DISK], int *);
81 void restore_emulated_hw(struct vm_create_params *, int, int *,
82     int[][VM_MAX_BASE_PER_DISK], int);
83 void vcpu_exit_inout(struct vm_run_params *);
84 uint8_t vcpu_exit_pci(struct vm_run_params *);
85 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
86 int loadfile_bios(FILE *, struct vcpu_reg_state *);
87 int send_vm(int, struct vm_create_params *);
88 int dump_send_header(int);
89 int dump_vmr(int, struct vm_mem_range *);
90 int dump_mem(int, struct vm_create_params *);
91 void restore_vmr(int, struct vm_mem_range *);
92 void restore_mem(int, struct vm_create_params *);
93 int restore_vm_params(int, struct vm_create_params *);
94 void pause_vm(struct vm_create_params *);
95 void unpause_vm(struct vm_create_params *);
96 
97 int translate_gva(struct vm_exit *, uint64_t, uint64_t *, int);
98 
99 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
100     size_t);
101 
102 int con_fd;
103 struct vmd_vm *current_vm;
104 
105 extern struct vmd *env;
106 
107 extern char *__progname;
108 
109 pthread_mutex_t threadmutex;
110 pthread_cond_t threadcond;
111 
112 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
113 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
114 pthread_cond_t vcpu_pause_cond[VMM_MAX_VCPUS_PER_VM];
115 pthread_mutex_t vcpu_pause_mtx[VMM_MAX_VCPUS_PER_VM];
116 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
117 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
118 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
119 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
120 
121 /*
122  * Represents a standard register set for an OS to be booted
123  * as a flat 64 bit address space.
124  *
125  * NOT set here are:
126  *  RIP
127  *  RSP
128  *  GDTR BASE
129  *
130  * Specific bootloaders should clone this structure and override
131  * those fields as needed.
132  *
133  * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
134  *        features of the CPU in use.
135  */
136 static const struct vcpu_reg_state vcpu_init_flat64 = {
137 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
138 	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
139 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
140 	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
141 	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
142 	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
143 	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
144 	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
145 	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
146 	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
147 	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
148 	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
149 	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
150 	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
151 	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
152 	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
153 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
154 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
155 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
156 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
157 	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
158 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
159 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
160 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
161 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
162 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
163 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
164 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
165 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
166 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
167 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
168 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
169 	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
170 	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
171 };
172 
173 /*
174  * Represents a standard register set for a BIOS to be booted
175  * as a flat 16 bit address space.
176  */
177 static const struct vcpu_reg_state vcpu_init_flat16 = {
178 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
179 	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
180 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
181 	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
182 	.vrs_crs[VCPU_REGS_CR3] = 0,
183 	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
184 	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
185 	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
186 	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
187 	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
188 	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
189 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
190 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
191 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
192 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
193 	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
194 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
195 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
196 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
197 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
198 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
199 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
200 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
201 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
202 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
203 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
204 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
205 	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
206 };
207 
208 /*
209  * loadfile_bios
210  *
211  * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
212  * directly into memory.
213  *
214  * Parameters:
215  *  fp: FILE pointer to the BIOS image to load
216  *  (out) vrs: register state to set on init for this BIOS image
217  *
218  * Return values:
219  *  0 if successful
220  *  -1 if the image could not be read or is larger than 1MB (errno set)
221  */
222 int
223 loadfile_bios(FILE *fp, struct vcpu_reg_state *vrs)
224 {
225 	off_t	 size, off;
226 
227 	/* Set up a "flat 16 bit" register state for BIOS */
228 	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
229 
230 	/* Get the size of the BIOS image and seek to the beginning */
231 	if (fseeko(fp, 0, SEEK_END) == -1 || (size = ftello(fp)) == -1 ||
232 	    fseeko(fp, 0, SEEK_SET) == -1)
233 		return (-1);
234 
235 	/* The BIOS image must end at 1M */
236 	if ((off = 1048576 - size) < 0)
237 		return (-1);
238 
239 	/* Read BIOS image into memory */
240 	if (mread(fp, off, size) != (size_t)size) {
241 		errno = EIO;
242 		return (-1);
243 	}
244 
245 	log_debug("%s: loaded BIOS image", __func__);
246 
247 	return (0);
248 }
249 
250 /*
251  * start_vm
252  *
253  * After forking a new VM process, starts the new VM with the creation
254  * parameters supplied (in the incoming vm->vm_params field). This
255  * function performs a basic sanity check on the incoming parameters
256  * and then performs the following steps to complete the creation of the VM:
257  *
258  * 1. validates and creates the new VM
259  * 2. opens the imsg control channel to the parent and drops more privilege
260  * 3. drops additional privileges by calling pledge(2)
261  * 4. loads the kernel from the disk image or file descriptor
262  * 5. runs the VM's VCPU loops.
263  *
264  * Parameters:
265  *  vm: The VM data structure, including the VM create parameters.
266  *  fd: The imsg socket that is connected to the parent process.
267  *
268  * Return values:
269  *  0: success
270  *  !0 : failure - typically an errno indicating the source of the failure
271  */
272 int
273 start_vm(struct vmd_vm *vm, int fd)
274 {
275 	struct vmop_create_params *vmc = &vm->vm_params;
276 	struct vm_create_params	*vcp = &vmc->vmc_params;
277 	struct vcpu_reg_state	 vrs;
278 	int			 nicfds[VMM_MAX_NICS_PER_VM];
279 	int			 ret;
280 	FILE			*fp;
281 	struct vmboot_params	 vmboot;
282 	size_t			 i;
283 	struct vm_rwregs_params  vrp;
284 
285 	/* Child */
286 	setproctitle("%s", vcp->vcp_name);
287 	log_procinit(vcp->vcp_name);
288 
289 	if (!(vm->vm_state & VM_STATE_RECEIVED))
290 		create_memory_map(vcp);
291 
292 	ret = alloc_guest_mem(vcp);
293 
294 	if (ret) {
295 		errno = ret;
296 		fatal("could not allocate guest memory - exiting");
297 	}
298 
299 	ret = vmm_create_vm(vcp);
300 	current_vm = vm;
301 
302 	/* send back the kernel-generated vm id (0 on error) */
303 	if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
304 	    sizeof(vcp->vcp_id))
305 		fatal("write vcp id");
306 
307 	if (ret) {
308 		errno = ret;
309 		fatal("create vmm ioctl failed - exiting");
310 	}
311 
312 	/*
313 	 * pledge in the vm processes:
314 	 * stdio - for malloc and basic I/O including events.
315 	 * recvfd - for send/recv.
316 	 * vmm - for the vmm ioctls and operations.
317 	 */
318 	if (pledge("stdio vmm recvfd", NULL) == -1)
319 		fatal("pledge");
320 
321 	if (vm->vm_state & VM_STATE_RECEIVED) {
322 		ret = read(vm->vm_receive_fd, &vrp, sizeof(vrp));
323 		if (ret != sizeof(vrp)) {
324 			fatal("received incomplete vrp - exiting");
325 		}
326 		vrs = vrp.vrwp_regs;
327 	} else {
328 		/*
329 		 * Set up default "flat 64 bit" register state - RIP,
330 		 * RSP, and GDT info will be set in bootloader
331 		 */
332 		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
333 
334 		/* Find and open kernel image */
335 		if ((fp = vmboot_open(vm->vm_kernel,
336 		    vm->vm_disks[0], vmc->vmc_diskbases[0],
337 		    vmc->vmc_disktypes[0], &vmboot)) == NULL)
338 			fatalx("failed to open kernel - exiting");
339 
340 		/* Load kernel image */
341 		ret = loadfile_elf(fp, vcp, &vrs,
342 		    vmboot.vbp_bootdev, vmboot.vbp_howto, vmc->vmc_bootdevice);
343 
344 		/*
345 		 * Try BIOS as a fallback (only if it was provided as an image
346 		 * with vm->vm_kernel and not loaded from the disk)
347 		 */
348 		if (ret && errno == ENOEXEC && vm->vm_kernel != -1)
349 			ret = loadfile_bios(fp, &vrs);
350 
351 		if (ret)
352 			fatal("failed to load kernel or BIOS - exiting");
353 
354 		vmboot_close(fp, &vmboot);
355 	}
356 
357 	if (vm->vm_kernel != -1)
358 		close(vm->vm_kernel);
359 
360 	con_fd = vm->vm_tty;
361 	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
362 		fatal("failed to set nonblocking mode on console");
363 
364 	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
365 		nicfds[i] = vm->vm_ifs[i].vif_fd;
366 
367 	event_init();
368 
369 	if (vm->vm_state & VM_STATE_RECEIVED) {
370 		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
371 		    vm->vm_disks, vm->vm_cdrom);
372 		restore_mem(vm->vm_receive_fd, vcp);
373 		if (restore_vm_params(vm->vm_receive_fd, vcp))
374 			fatal("restore vm params failed");
375 		unpause_vm(vcp);
376 	}
377 
378 	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
379 		fatal("setup vm pipe");
380 
381 	/* Execute the vcpu run loop(s) for this VM */
382 	ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);
383 
384 	/* Ensure that any in-flight data is written back */
385 	virtio_shutdown(vm);
386 
387 	return (ret);
388 }
389 
390 /*
391  * vm_dispatch_vmm
392  *
393  * imsg callback for messages that are received from the vmm parent process.
394  */
395 void
396 vm_dispatch_vmm(int fd, short event, void *arg)
397 {
398 	struct vmd_vm		*vm = arg;
399 	struct vmop_result	 vmr;
400 	struct imsgev		*iev = &vm->vm_iev;
401 	struct imsgbuf		*ibuf = &iev->ibuf;
402 	struct imsg		 imsg;
403 	ssize_t			 n;
404 	int			 verbose;
405 
406 	if (event & EV_READ) {
407 		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
408 			fatal("%s: imsg_read", __func__);
409 		if (n == 0)
410 			_exit(0);
411 	}
412 
413 	if (event & EV_WRITE) {
414 		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
415 			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
416 		if (n == 0)
417 			_exit(0);
418 	}
419 
420 	for (;;) {
421 		if ((n = imsg_get(ibuf, &imsg)) == -1)
422 			fatal("%s: imsg_get", __func__);
423 		if (n == 0)
424 			break;
425 
426 #if DEBUG > 1
427 		log_debug("%s: got imsg %d from %s",
428 		    __func__, imsg.hdr.type,
429 		    vm->vm_params.vmc_params.vcp_name);
430 #endif
431 
432 		switch (imsg.hdr.type) {
433 		case IMSG_CTL_VERBOSE:
434 			IMSG_SIZE_CHECK(&imsg, &verbose);
435 			memcpy(&verbose, imsg.data, sizeof(verbose));
436 			log_setverbose(verbose);
437 			break;
438 		case IMSG_VMDOP_VM_SHUTDOWN:
439 			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
440 				_exit(0);
441 			break;
442 		case IMSG_VMDOP_VM_REBOOT:
443 			if (vmmci_ctl(VMMCI_REBOOT) == -1)
444 				_exit(0);
445 			break;
446 		case IMSG_VMDOP_PAUSE_VM:
447 			vmr.vmr_result = 0;
448 			vmr.vmr_id = vm->vm_vmid;
449 			pause_vm(&vm->vm_params.vmc_params);
450 			imsg_compose_event(&vm->vm_iev,
451 			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
452 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
453 			    sizeof(vmr));
454 			break;
455 		case IMSG_VMDOP_UNPAUSE_VM:
456 			vmr.vmr_result = 0;
457 			vmr.vmr_id = vm->vm_vmid;
458 			unpause_vm(&vm->vm_params.vmc_params);
459 			imsg_compose_event(&vm->vm_iev,
460 			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
461 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
462 			    sizeof(vmr));
463 			break;
464 		case IMSG_VMDOP_SEND_VM_REQUEST:
465 			vmr.vmr_id = vm->vm_vmid;
466 			vmr.vmr_result = send_vm(imsg.fd,
467 			    &vm->vm_params.vmc_params);
468 			imsg_compose_event(&vm->vm_iev,
469 			    IMSG_VMDOP_SEND_VM_RESPONSE,
470 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
471 			    sizeof(vmr));
472 			break;
473 		default:
474 			fatalx("%s: got invalid imsg %d from %s",
475 			    __func__, imsg.hdr.type,
476 			    vm->vm_params.vmc_params.vcp_name);
477 		}
478 		imsg_free(&imsg);
479 	}
480 	imsg_event_add(iev);
481 }
482 
483 /*
484  * vm_shutdown
485  *
486  * Tell the vmm parent process to shut down or reboot the VM, then exit.
487  */
488 __dead void
489 vm_shutdown(unsigned int cmd)
490 {
491 	switch (cmd) {
492 	case VMMCI_NONE:
493 	case VMMCI_SHUTDOWN:
494 		(void)imsg_compose_event(&current_vm->vm_iev,
495 		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
496 		break;
497 	case VMMCI_REBOOT:
498 		(void)imsg_compose_event(&current_vm->vm_iev,
499 		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
500 		break;
501 	default:
502 		fatalx("invalid vm ctl command: %d", cmd);
503 	}
504 	imsg_flush(&current_vm->vm_iev.ibuf);
505 
506 	_exit(0);
507 }
508 
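/*
 * send_vm
 *
 * Dumps the current VM to the supplied file descriptor so it can be received
 * and resumed elsewhere. The stream consists of the dump header, the
 * vmop_create_params, the register state of every vcpu, the emulated hardware
 * state (i8253, i8259, ns8250, mc146818, fw_cfg, pci, virtio), guest memory,
 * and the per-vcpu vm parameters, in that order. The VM is paused before
 * dumping and terminated on success; on failure it is unpaused again.
 *
 * Parameters:
 *  fd: file descriptor to write the dump to
 *  vcp: VM create parameters of the VM being sent
 *
 * Return values:
 *  0: success
 *  !0: failure
 */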
509 int
510 send_vm(int fd, struct vm_create_params *vcp)
511 {
512 	struct vm_rwregs_params	   vrp;
513 	struct vm_rwvmparams_params vpp;
514 	struct vmop_create_params *vmc;
515 	struct vm_terminate_params vtp;
516 	unsigned int		   flags = 0;
517 	unsigned int		   i;
518 	int			   ret = 0;
519 	size_t			   sz;
520 
521 	if (dump_send_header(fd)) {
522 		log_info("%s: failed to send vm dump header", __func__);
523 		goto err;
524 	}
525 
526 	pause_vm(vcp);
527 
528 	vmc = calloc(1, sizeof(struct vmop_create_params));
529 	if (vmc == NULL) {
530 		log_warn("%s: calloc error getting vmc", __func__);
531 		ret = -1;
532 		goto err;
533 	}
534 
535 	flags |= VMOP_CREATE_MEMORY;
536 	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
537 	    vmop_create_params));
538 	vmc->vmc_flags = flags;
539 	vrp.vrwp_vm_id = vcp->vcp_id;
540 	vrp.vrwp_mask = VM_RWREGS_ALL;
541 	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
542 	vpp.vpp_vm_id = vcp->vcp_id;
543 
544 	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
545 	if (sz != sizeof(struct vmop_create_params)) {
546 		ret = -1;
547 		goto err;
548 	}
549 
550 	for (i = 0; i < vcp->vcp_ncpus; i++) {
551 		vrp.vrwp_vcpu_id = i;
552 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
553 			log_warn("%s: readregs failed", __func__);
554 			goto err;
555 		}
556 
557 		sz = atomicio(vwrite, fd, &vrp,
558 		    sizeof(struct vm_rwregs_params));
559 		if (sz != sizeof(struct vm_rwregs_params)) {
560 			log_warn("%s: dumping registers failed", __func__);
561 			ret = -1;
562 			goto err;
563 		}
564 	}
565 
566 	if ((ret = i8253_dump(fd)))
567 		goto err;
568 	if ((ret = i8259_dump(fd)))
569 		goto err;
570 	if ((ret = ns8250_dump(fd)))
571 		goto err;
572 	if ((ret = mc146818_dump(fd)))
573 		goto err;
574 	if ((ret = fw_cfg_dump(fd)))
575 		goto err;
576 	if ((ret = pci_dump(fd)))
577 		goto err;
578 	if ((ret = virtio_dump(fd)))
579 		goto err;
580 	if ((ret = dump_mem(fd, vcp)))
581 		goto err;
582 
583 	for (i = 0; i < vcp->vcp_ncpus; i++) {
584 		vpp.vpp_vcpu_id = i;
585 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
586 			log_warn("%s: readvmparams failed", __func__);
587 			goto err;
588 		}
589 
590 		sz = atomicio(vwrite, fd, &vpp,
591 		    sizeof(struct vm_rwvmparams_params));
592 		if (sz != sizeof(struct vm_rwvmparams_params)) {
593 			log_warn("%s: dumping vm params failed", __func__);
594 			ret = -1;
595 			goto err;
596 		}
597 	}
598 
599 	vtp.vtp_vm_id = vcp->vcp_id;
600 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
601 		log_warnx("%s: term IOC error: %d", __func__,
602 		    errno);
603 	}
604 err:
605 	close(fd);
606 	if (ret)
607 		unpause_vm(vcp);
608 	return ret;
609 }
610 
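/*
 * dump_send_header
 *
 * Writes the vm_dump_header to fd: the VM_DUMP_SIGNATURE, the values of CPUID
 * leaves 0x0, 0x1, 0x7, 0xd and 0x80000001 (subleaf 0) as read on the sending
 * host, and the VM_DUMP_VERSION, so the receiving side can check the dump for
 * compatibility.
 *
 * Return values:
 *  0: success
 *  -1: write failure
 */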
611 int
612 dump_send_header(int fd) {
613 	struct vm_dump_header	   vmh;
614 	int			   i;
615 
616 	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
617 	    sizeof(vmh.vmh_signature));
618 
619 	vmh.vmh_cpuids[0].code = 0x00;
620 	vmh.vmh_cpuids[0].leaf = 0x00;
621 
622 	vmh.vmh_cpuids[1].code = 0x01;
623 	vmh.vmh_cpuids[1].leaf = 0x00;
624 
625 	vmh.vmh_cpuids[2].code = 0x07;
626 	vmh.vmh_cpuids[2].leaf = 0x00;
627 
628 	vmh.vmh_cpuids[3].code = 0x0d;
629 	vmh.vmh_cpuids[3].leaf = 0x00;
630 
631 	vmh.vmh_cpuids[4].code = 0x80000001;
632 	vmh.vmh_cpuids[4].leaf = 0x00;
633 
634 	vmh.vmh_version = VM_DUMP_VERSION;
635 
636 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
637 		CPUID_LEAF(vmh.vmh_cpuids[i].code,
638 		    vmh.vmh_cpuids[i].leaf,
639 		    vmh.vmh_cpuids[i].a,
640 		    vmh.vmh_cpuids[i].b,
641 		    vmh.vmh_cpuids[i].c,
642 		    vmh.vmh_cpuids[i].d);
643 	}
644 
645 	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
646 		return (-1);
647 
648 	return (0);
649 }
650 
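/*
 * dump_mem
 *
 * Writes every guest memory range described in vcp to fd, in order, by
 * calling dump_vmr() on each range.
 */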
651 int
652 dump_mem(int fd, struct vm_create_params *vcp)
653 {
654 	unsigned int	i;
655 	int		ret;
656 	struct		vm_mem_range *vmr;
657 
658 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
659 		vmr = &vcp->vcp_memranges[i];
660 		ret = dump_vmr(fd, vmr);
661 		if (ret)
662 			return ret;
663 	}
664 	return (0);
665 }
666 
667 int
668 restore_vm_params(int fd, struct vm_create_params *vcp) {
669 	unsigned int			i;
670 	struct vm_rwvmparams_params    vpp;
671 
672 	for (i = 0; i < vcp->vcp_ncpus; i++) {
673 		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
674 			log_warn("%s: error restoring vm params", __func__);
675 			return (-1);
676 		}
677 		vpp.vpp_vm_id = vcp->vcp_id;
678 		vpp.vpp_vcpu_id = i;
679 		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
680 			log_debug("%s: writing vm params failed", __func__);
681 			return (-1);
682 		}
683 	}
684 	return (0);
685 }
686 
687 void
688 restore_mem(int fd, struct vm_create_params *vcp)
689 {
690 	unsigned int	     i;
691 	struct vm_mem_range *vmr;
692 
693 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
694 		vmr = &vcp->vcp_memranges[i];
695 		restore_vmr(fd, vmr);
696 	}
697 }
698 
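/*
 * dump_vmr
 *
 * Writes a single guest memory range to fd in PAGE_SIZE chunks. vmr_size is
 * assumed to be a multiple of PAGE_SIZE, which holds for the ranges set up by
 * create_memory_map(); restore_vmr() below reads the same layout back in.
 */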
699 int
700 dump_vmr(int fd, struct vm_mem_range *vmr)
701 {
702 	size_t	rem = vmr->vmr_size, read = 0;
703 	char	buf[PAGE_SIZE];
704 
705 	while (rem > 0) {
706 		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
707 			log_warn("failed to read vmr");
708 			return (-1);
709 		}
710 		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
711 			log_warn("failed to dump vmr");
712 			return (-1);
713 		}
714 		rem = rem - PAGE_SIZE;
715 		read = read + PAGE_SIZE;
716 	}
717 	return (0);
718 }
719 
720 void
721 restore_vmr(int fd, struct vm_mem_range *vmr)
722 {
723 	size_t	rem = vmr->vmr_size, wrote = 0;
724 	char	buf[PAGE_SIZE];
725 
726 	while (rem > 0) {
727 		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
728 			fatal("failed to restore vmr");
729 		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
730 			fatal("failed to write vmr");
731 		rem = rem - PAGE_SIZE;
732 		wrote = wrote + PAGE_SIZE;
733 	}
734 }
735 
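/*
 * pause_vm
 *
 * Pauses a running VM: marks it VM_STATE_PAUSED and then, for every vcpu,
 * wakes the vcpu run thread via vcpu_run_cond and waits on vcpu_pause_cond
 * until that thread acknowledges the pause from vcpu_run_loop(). Finally the
 * emulated timers and devices (i8253, mc146818, ns8250, virtio) are stopped.
 */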
736 void
737 pause_vm(struct vm_create_params *vcp)
738 {
739 	unsigned int n;
740 	int ret;
741 	if (current_vm->vm_state & VM_STATE_PAUSED)
742 		return;
743 
744 	current_vm->vm_state |= VM_STATE_PAUSED;
745 
746 	for (n = 0; n < vcp->vcp_ncpus; n++) {
747 		ret = pthread_mutex_lock(&vcpu_pause_mtx[n]);
748 		if (ret) {
749 			log_warnx("%s: can't lock vcpu pause mtx (%d)",
750 			    __func__, (int)ret);
751 			return;
752 		}
753 
754 		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
755 		if (ret) {
756 			log_warnx("%s: can't broadcast vcpu run cond (%d)",
757 			    __func__, (int)ret);
758 			return;
759 		}
760 
761 		ret = pthread_cond_wait(&vcpu_pause_cond[n], &vcpu_pause_mtx[n]);
762 		if (ret) {
763 			log_warnx("%s: can't wait on vcpu pause cond (%d)",
764 			    __func__, (int)ret);
765 			return;
766 		}
767 		ret = pthread_mutex_unlock(&vcpu_pause_mtx[n]);
768 		if (ret) {
769 			log_warnx("%s: can't unlock vcpu mtx (%d)",
770 			    __func__, (int)ret);
771 			return;
772 		}
773 	}
774 
775 	i8253_stop();
776 	mc146818_stop();
777 	ns8250_stop();
778 	virtio_stop(vcp);
779 }
780 
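/*
 * unpause_vm
 *
 * Resumes a paused VM: clears VM_STATE_PAUSED, wakes every vcpu run thread
 * waiting on vcpu_unpause_cond in vcpu_run_loop(), and restarts the emulated
 * timers and devices.
 */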
781 void
782 unpause_vm(struct vm_create_params *vcp)
783 {
784 	unsigned int n;
785 	int ret;
786 	if (!(current_vm->vm_state & VM_STATE_PAUSED))
787 		return;
788 
789 	current_vm->vm_state &= ~VM_STATE_PAUSED;
790 	for (n = 0; n < vcp->vcp_ncpus; n++) {
791 		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
792 		if (ret) {
793 			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
794 			    __func__, (int)ret);
795 			return;
796 		}
797 	}
798 
799 	i8253_start();
800 	mc146818_start();
801 	ns8250_start();
802 	virtio_start(vcp);
803 }
804 
805 /*
806  * vcpu_reset
807  *
808  * Requests vmm(4) to reset the indicated VCPU in the indicated VM to
809  * the register state provided
810  *
811  * Parameters
812  *  vmid: VM ID to reset
813  *  vcpu_id: VCPU ID to reset
814  *  vrs: the register state to initialize
815  *
816  * Return values:
817  *  0: success
818  *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
819  *      valid)
820  */
821 int
822 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
823 {
824 	struct vm_resetcpu_params vrp;
825 
826 	memset(&vrp, 0, sizeof(vrp));
827 	vrp.vrp_vm_id = vmid;
828 	vrp.vrp_vcpu_id = vcpu_id;
829 	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
830 
831 	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
832 
833 	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
834 		return (errno);
835 
836 	return (0);
837 }
838 
839 /*
840  * create_memory_map
841  *
842  * Sets up the guest physical memory ranges that the VM can access.
843  *
844  * Parameters:
845  *  vcp: VM create parameters describing the VM whose memory map
846  *       is being created
847  *
848  * Return values:
849  *  nothing
850  */
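/*
 * The resulting layout (range 2's size is capped so that guest memory never
 * overlaps the PCI MMIO hole):
 *
 *  range 0: [0, LOWMEM_KB * 1024)              DOS low memory
 *  range 1: [LOWMEM_KB * 1024, 1MB)            ROM / VGA hole, always mapped
 *  range 2: [1MB, at most VMM_PCI_MMIO_BAR_BASE)  memory below the MMIO hole
 *  range 3: [VMM_PCI_MMIO_BAR_END + 1, ...)    remaining memory, if any
 */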
851 void
852 create_memory_map(struct vm_create_params *vcp)
853 {
854 	size_t len, mem_bytes, mem_mb;
855 
856 	mem_mb = vcp->vcp_memranges[0].vmr_size;
857 	vcp->vcp_nmemranges = 0;
858 	if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
859 		return;
860 
861 	mem_bytes = mem_mb * 1024 * 1024;
862 
863 	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
864 	len = LOWMEM_KB * 1024;
865 	vcp->vcp_memranges[0].vmr_gpa = 0x0;
866 	vcp->vcp_memranges[0].vmr_size = len;
867 	mem_bytes -= len;
868 
869 	/*
870 	 * Second memory region: LOWMEM_KB - 1MB.
871 	 *
872 	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
873 	 * We have to add this region, because some systems
874 	 * unconditionally write to 0xb8000 (VGA RAM), and
875 	 * we need to make sure that vmm(4) permits accesses
876 	 * to it. So allocate guest memory for it.
877 	 */
878 	len = 0x100000 - LOWMEM_KB * 1024;
879 	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
880 	vcp->vcp_memranges[1].vmr_size = len;
881 	mem_bytes -= len;
882 
883 	/* Make sure that we do not place physical memory into MMIO ranges. */
884 	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
885 		len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
886 	else
887 		len = mem_bytes;
888 
889 	/* Third memory region: 1MB - (1MB + len) */
890 	vcp->vcp_memranges[2].vmr_gpa = 0x100000;
891 	vcp->vcp_memranges[2].vmr_size = len;
892 	mem_bytes -= len;
893 
894 	if (mem_bytes > 0) {
895 		/* Fourth memory region for the remaining memory (if any) */
896 		vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
897 		vcp->vcp_memranges[3].vmr_size = mem_bytes;
898 		vcp->vcp_nmemranges = 4;
899 	} else
900 		vcp->vcp_nmemranges = 3;
901 }
902 
903 /*
904  * alloc_guest_mem
905  *
906  * Allocates memory for the guest.
907  * Instead of doing a single allocation with one mmap(), we allocate memory
908  * separately for every range for the following reasons:
909  * - ASLR for the individual ranges
910  * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
911  *   map the single mmap'd userspace memory to the individual guest physical
912  *   memory ranges, the underlying amap of the single mmap'd range would have
913  *   to allocate per-page reference counters. The reason is that the
914  *   individual guest physical ranges would reference the single mmap'd region
915  *   only partially. However, if every guest physical range has its own
916  *   corresponding mmap'd userspace allocation, there are no partial
917  *   references: every guest physical range fully references an mmap'd
918  *   range => no per-page reference counters have to be allocated.
919  *
920  * Return values:
921  *  0: success
922  *  !0: failure - errno indicating the source of the failure
923  */
924 int
925 alloc_guest_mem(struct vm_create_params *vcp)
926 {
927 	void *p;
928 	int ret;
929 	size_t i, j;
930 	struct vm_mem_range *vmr;
931 
932 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
933 		vmr = &vcp->vcp_memranges[i];
934 		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
935 		    MAP_PRIVATE | MAP_ANON, -1, 0);
936 		if (p == MAP_FAILED) {
937 			ret = errno;
938 			for (j = 0; j < i; j++) {
939 				vmr = &vcp->vcp_memranges[j];
940 				munmap((void *)vmr->vmr_va, vmr->vmr_size);
941 			}
942 
943 			return (ret);
944 		}
945 
946 		vmr->vmr_va = (vaddr_t)p;
947 	}
948 
949 	return (0);
950 }
951 
952 /*
953  * vmm_create_vm
954  *
955  * Requests vmm(4) to create a new VM using the supplied creation
956  * parameters. This operation results in the creation of the in-kernel
957  * structures for the VM, but does not start the VM's vcpu(s).
958  *
959  * Parameters:
960  *  vcp: vm_create_params struct containing the VM's desired creation
961  *      configuration
962  *
963  * Return values:
964  *  0: success
965  *  !0 : ioctl to vmm(4) failed
966  */
967 int
968 vmm_create_vm(struct vm_create_params *vcp)
969 {
970 	/* Sanity check arguments */
971 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
972 		return (EINVAL);
973 
974 	if (vcp->vcp_nmemranges == 0 ||
975 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
976 		return (EINVAL);
977 
978 	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
979 		return (EINVAL);
980 
981 	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
982 		return (EINVAL);
983 
984 	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
985 		return (errno);
986 
987 	return (0);
988 }
989 
990 /*
991  * init_emulated_hw
992  *
993  * Initializes the userspace hardware emulation
994  */
995 void
996 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
997     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
998 {
999 	struct vm_create_params *vcp = &vmc->vmc_params;
1000 	int i;
1001 	uint64_t memlo, memhi;
1002 
1003 	/* Calculate memory size for NVRAM registers */
1004 	memlo = memhi = 0;
1005 	if (vcp->vcp_nmemranges > 2)
1006 		memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000;
1007 
1008 	if (vcp->vcp_nmemranges > 3)
1009 		memhi = vcp->vcp_memranges[3].vmr_size;
1010 
1011 	/* Reset the IO port map */
1012 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1013 
1014 	/* Init i8253 PIT */
1015 	i8253_init(vcp->vcp_id);
1016 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1017 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1018 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1019 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1020 	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
1021 
1022 	/* Init mc146818 RTC */
1023 	mc146818_init(vcp->vcp_id, memlo, memhi);
1024 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1025 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1026 
1027 	/* Init master and slave PICs */
1028 	i8259_init();
1029 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1030 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1031 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1032 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1033 	ioports_map[ELCR0] = vcpu_exit_elcr;
1034 	ioports_map[ELCR1] = vcpu_exit_elcr;
1035 
1036 	/* Init ns8250 UART */
1037 	ns8250_init(con_fd, vcp->vcp_id);
1038 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1039 		ioports_map[i] = vcpu_exit_com;
1040 
1041 	/* Init QEMU fw_cfg interface */
1042 	fw_cfg_init(vmc);
1043 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1044 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1045 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1046 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1047 
1048 	/* Initialize PCI */
1049 	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
1050 		ioports_map[i] = vcpu_exit_pci;
1051 
1052 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1053 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1054 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1055 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1056 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1057 	pci_init();
1058 
1059 	/* Initialize virtio devices */
1060 	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
1061 }
1062 /*
1063  * restore_emulated_hw
1064  *
1065  * Restores the userspace hardware emulation from fd
1066  */
1067 void
1068 restore_emulated_hw(struct vm_create_params *vcp, int fd,
1069     int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
1070 {
1071 	/* struct vm_create_params *vcp = &vmc->vmc_params; */
1072 	int i;
1073 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1074 
1075 	/* Init i8253 PIT */
1076 	i8253_restore(fd, vcp->vcp_id);
1077 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1078 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1079 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1080 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1081 
1082 	/* Init master and slave PICs */
1083 	i8259_restore(fd);
1084 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1085 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1086 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1087 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1088 
1089 	/* Init ns8250 UART */
1090 	ns8250_restore(fd, con_fd, vcp->vcp_id);
1091 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1092 		ioports_map[i] = vcpu_exit_com;
1093 
1094 	/* Init mc146818 RTC */
1095 	mc146818_restore(fd, vcp->vcp_id);
1096 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1097 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1098 
1099 	/* Init QEMU fw_cfg interface */
1100 	fw_cfg_restore(fd);
1101 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1102 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1103 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1104 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1105 
1106 	/* Initialize PCI */
1107 	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
1108 		ioports_map[i] = vcpu_exit_pci;
1109 
1110 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1111 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1112 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1113 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1114 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1115 	pci_restore(fd);
1116 	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
1117 }
1118 
1119 /*
1120  * run_vm
1121  *
1122  * Runs the VM whose creation parameters are specified in vmc
1123  *
1124  * Parameters:
1125  *  child_cdrom: previously-opened child ISO disk file descriptor
1126  *  child_disks: previously-opened child VM disk file descriptors
1127  *  child_taps: previously-opened child tap file descriptors
1128  *  vmc: vmop_create_params struct containing the VM's desired creation
1129  *      configuration
1130  *  vrs: VCPU register state to initialize
1131  *
1132  * Return values:
1133  *  0: the VM exited normally
1134  *  !0 : the VM exited abnormally or failed to start
1135  */
1136 int
1137 run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
1138     int *child_taps, struct vmop_create_params *vmc,
1139     struct vcpu_reg_state *vrs)
1140 {
1141 	struct vm_create_params *vcp = &vmc->vmc_params;
1142 	struct vm_rwregs_params vregsp;
1143 	uint8_t evdone = 0;
1144 	size_t i;
1145 	int ret;
1146 	pthread_t *tid, evtid;
1147 	struct vm_run_params **vrp;
1148 	void *exit_status;
1149 
1150 	if (vcp == NULL)
1151 		return (EINVAL);
1152 
1153 	if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
1154 		return (EINVAL);
1155 
1156 	if (child_disks == NULL && vcp->vcp_ndisks != 0)
1157 		return (EINVAL);
1158 
1159 	if (child_taps == NULL && vcp->vcp_nnics != 0)
1160 		return (EINVAL);
1161 
1162 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
1163 		return (EINVAL);
1164 
1165 	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
1166 		return (EINVAL);
1167 
1168 	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
1169 		return (EINVAL);
1170 
1171 	if (vcp->vcp_nmemranges == 0 ||
1172 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1173 		return (EINVAL);
1174 
1175 	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
1176 	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
1177 	if (tid == NULL || vrp == NULL) {
1178 		log_warn("%s: memory allocation error - exiting.",
1179 		    __progname);
1180 		return (ENOMEM);
1181 	}
1182 
1183 	log_debug("%s: initializing hardware for vm %s", __func__,
1184 	    vcp->vcp_name);
1185 
1186 	if (!(current_vm->vm_state & VM_STATE_RECEIVED))
1187 		init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);
1188 
1189 	ret = pthread_mutex_init(&threadmutex, NULL);
1190 	if (ret) {
1191 		log_warn("%s: could not initialize thread state mutex",
1192 		    __func__);
1193 		return (ret);
1194 	}
1195 	ret = pthread_cond_init(&threadcond, NULL);
1196 	if (ret) {
1197 		log_warn("%s: could not initialize thread state "
1198 		    "condition variable", __func__);
1199 		return (ret);
1200 	}
1201 
1202 	mutex_lock(&threadmutex);
1203 
1204 	log_debug("%s: starting vcpu threads for vm %s", __func__,
1205 	    vcp->vcp_name);
1206 
1207 	/*
1208 	 * Create and launch one thread for each VCPU. These threads may
1209 	 * migrate between PCPUs over time; the need to reload CPU state
1210 	 * in such situations is detected and performed by vmm(4) in the
1211 	 * kernel.
1212 	 */
1213 	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
1214 		vrp[i] = malloc(sizeof(struct vm_run_params));
1215 		if (vrp[i] == NULL) {
1216 			log_warn("%s: memory allocation error - "
1217 			    "exiting.", __progname);
1218 			/* caller will exit, so skip freeing */
1219 			return (ENOMEM);
1220 		}
1221 		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
1222 		if (vrp[i]->vrp_exit == NULL) {
1223 			log_warn("%s: memory allocation error - "
1224 			    "exiting.", __progname);
1225 			/* caller will exit, so skip freeing */
1226 			return (ENOMEM);
1227 		}
1228 		vrp[i]->vrp_vm_id = vcp->vcp_id;
1229 		vrp[i]->vrp_vcpu_id = i;
1230 
1231 		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
1232 			log_warnx("%s: cannot reset VCPU %zu - exiting.",
1233 			    __progname, i);
1234 			return (EIO);
1235 		}
1236 
1237 		/* write the regs again as vcpu_reset() may have changed them */
1238 		if (current_vm->vm_state & VM_STATE_RECEIVED) {
1239 			vregsp.vrwp_vm_id = vcp->vcp_id;
1240 			vregsp.vrwp_vcpu_id = i;
1241 			vregsp.vrwp_regs = *vrs;
1242 			vregsp.vrwp_mask = VM_RWREGS_ALL;
1243 			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
1244 			    &vregsp)) == -1) {
1245 				log_warn("%s: writeregs failed", __func__);
1246 				return (ret);
1247 			}
1248 		}
1249 
1250 		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
1251 		if (ret) {
1252 			log_warnx("%s: cannot initialize cond var (%d)",
1253 			    __progname, ret);
1254 			return (ret);
1255 		}
1256 
1257 		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
1258 		if (ret) {
1259 			log_warnx("%s: cannot initialize mtx (%d)",
1260 			    __progname, ret);
1261 			return (ret);
1262 		}
1263 		ret = pthread_cond_init(&vcpu_pause_cond[i], NULL);
1264 		if (ret) {
1265 			log_warnx("%s: cannot initialize pause cond var (%d)",
1266 			    __progname, ret);
1267 			return (ret);
1268 		}
1269 
1270 		ret = pthread_mutex_init(&vcpu_pause_mtx[i], NULL);
1271 		if (ret) {
1272 			log_warnx("%s: cannot initialize pause mtx (%d)",
1273 			    __progname, ret);
1274 			return (ret);
1275 		}
1276 		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
1277 		if (ret) {
1278 			log_warnx("%s: cannot initialize unpause var (%d)",
1279 			    __progname, ret);
1280 			return (ret);
1281 		}
1282 
1283 		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
1284 		if (ret) {
1285 			log_warnx("%s: cannot initialize unpause mtx (%d)",
1286 			    __progname, ret);
1287 			return (ret);
1288 		}
1289 
1290 		vcpu_hlt[i] = 0;
1291 
1292 		/* Start each VCPU run thread at vcpu_run_loop */
1293 		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
1294 		if (ret) {
1295 			/* caller will _exit after this return */
1296 			errno = ret;
1297 			log_warn("%s: could not create vcpu thread %zu",
1298 			    __func__, i);
1299 			return (ret);
1300 		}
1301 	}
1302 
1303 	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
1304 	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
1305 	if (ret) {
1306 		errno = ret;
1307 		log_warn("%s: could not create event thread", __func__);
1308 		return (ret);
1309 	}
1310 
1311 	for (;;) {
1312 		ret = pthread_cond_wait(&threadcond, &threadmutex);
1313 		if (ret) {
1314 			log_warn("%s: waiting on thread state condition "
1315 			    "variable failed", __func__);
1316 			return (ret);
1317 		}
1318 
1319 		/*
1320 		 * Did a VCPU thread exit with an error? => return the first one
1321 		 */
1322 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1323 			if (vcpu_done[i] == 0)
1324 				continue;
1325 
1326 			if (pthread_join(tid[i], &exit_status)) {
1327 				log_warn("%s: failed to join thread %zd - "
1328 				    "exiting", __progname, i);
1329 				return (EIO);
1330 			}
1331 
1332 			ret = (intptr_t)exit_status;
1333 		}
1334 
1335 		/* Did the event thread exit? => return with an error */
1336 		if (evdone) {
1337 			if (pthread_join(evtid, &exit_status)) {
1338 				log_warn("%s: failed to join event thread - "
1339 				    "exiting", __progname);
1340 				return (EIO);
1341 			}
1342 
1343 			log_warnx("%s: vm %d event thread exited "
1344 			    "unexpectedly", __progname, vcp->vcp_id);
1345 			return (EIO);
1346 		}
1347 
1348 		/* Did all VCPU threads exit successfully? => return */
1349 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1350 			if (vcpu_done[i] == 0)
1351 				break;
1352 		}
1353 		if (i == vcp->vcp_ncpus)
1354 			return (ret);
1355 
1356 		/* Some more threads to wait for, start over */
1357 	}
1358 
1359 	return (ret);
1360 }
1361 
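/*
 * event_thread
 *
 * Runs the libevent loop that services the emulated devices and the imsg
 * channel to the parent. When event_dispatch() returns, sets *donep and
 * signals threadcond so that run_vm() notices the (unexpected) exit.
 */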
1362 void *
1363 event_thread(void *arg)
1364 {
1365 	uint8_t *donep = arg;
1366 	intptr_t ret;
1367 
1368 	ret = event_dispatch();
1369 
1370 	mutex_lock(&threadmutex);
1371 	*donep = 1;
1372 	pthread_cond_signal(&threadcond);
1373 	mutex_unlock(&threadmutex);
1374 
1375 	return (void *)ret;
1376 }
1377 
1378 /*
1379  * vcpu_run_loop
1380  *
1381  * Runs a single VCPU until vmm(4) requires help handling an exit,
1382  * or the VM terminates.
1383  *
1384  * Parameters:
1385  *  arg: vcpu_run_params for the VCPU being run by this thread
1386  *
1387  * Return values:
1388  *  NULL: the VCPU shutdown properly
1389  *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
1390  */
1391 void *
1392 vcpu_run_loop(void *arg)
1393 {
1394 	struct vm_run_params *vrp = (struct vm_run_params *)arg;
1395 	intptr_t ret = 0;
1396 	int irq;
1397 	uint32_t n;
1398 
1399 	vrp->vrp_continue = 0;
1400 	n = vrp->vrp_vcpu_id;
1401 
1402 	for (;;) {
1403 		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
1404 
1405 		if (ret) {
1406 			log_warnx("%s: can't lock vcpu run mtx (%d)",
1407 			    __func__, (int)ret);
1408 			return ((void *)ret);
1409 		}
1410 
1411 		/* If we are halted and need to pause, pause */
1412 		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
1413 			ret = pthread_cond_broadcast(&vcpu_pause_cond[n]);
1414 			if (ret) {
1415 				log_warnx("%s: can't broadcast vcpu pause cond"
1416 				    " (%d)", __func__, (int)ret);
1417 				return ((void *)ret);
1418 			}
1419 
1420 			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
1421 			if (ret) {
1422 				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
1423 				    __func__, (int)ret);
1424 				return ((void *)ret);
1425 			}
1426 
1427 			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
1428 			    &vcpu_unpause_mtx[n]);
1429 			if (ret) {
1430 				log_warnx(
1431 				    "%s: can't wait on unpause cond (%d)",
1432 				    __func__, (int)ret);
1433 				break;
1434 			}
1435 			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
1436 			if (ret) {
1437 				log_warnx("%s: can't unlock unpause mtx (%d)",
1438 				    __func__, (int)ret);
1439 				break;
1440 			}
1441 		}
1442 
1443 		/* If we are halted and not paused, wait */
1444 		if (vcpu_hlt[n]) {
1445 			ret = pthread_cond_wait(&vcpu_run_cond[n],
1446 			    &vcpu_run_mtx[n]);
1447 
1448 			if (ret) {
1449 				log_warnx(
1450 				    "%s: can't wait on cond (%d)",
1451 				    __func__, (int)ret);
1452 				(void)pthread_mutex_unlock(
1453 				    &vcpu_run_mtx[n]);
1454 				break;
1455 			}
1456 		}
1457 
1458 		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
1459 
1460 		if (ret) {
1461 			log_warnx("%s: can't unlock mutex on cond (%d)",
1462 			    __func__, (int)ret);
1463 			break;
1464 		}
1465 
1466 		if (vrp->vrp_irqready && i8259_is_pending()) {
1467 			irq = i8259_ack();
1468 			vrp->vrp_irq = irq;
1469 		} else
1470 			vrp->vrp_irq = 0xFFFF;
1471 
1472 		/* Still more pending? */
1473 		if (i8259_is_pending()) {
1474 			/* XXX can probably avoid ioctls here by providing intr in vrp */
1475 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1476 			    vrp->vrp_vcpu_id, 1)) {
1477 				fatal("can't set INTR");
1478 			}
1479 		} else {
1480 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1481 			    vrp->vrp_vcpu_id, 0)) {
1482 				fatal("can't clear INTR");
1483 			}
1484 		}
1485 
1486 		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
1487 			/* If run ioctl failed, exit */
1488 			ret = errno;
1489 			log_warn("%s: vm %d / vcpu %d run ioctl failed",
1490 			    __func__, vrp->vrp_vm_id, n);
1491 			break;
1492 		}
1493 
1494 		/* If the VM is terminating, exit normally */
1495 		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
1496 			ret = (intptr_t)NULL;
1497 			break;
1498 		}
1499 
1500 		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
1501 			/*
1502 			 * vmm(4) needs help handling an exit, handle in
1503 			 * vcpu_exit.
1504 			 */
1505 			ret = vcpu_exit(vrp);
1506 			if (ret)
1507 				break;
1508 		}
1509 	}
1510 
1511 	mutex_lock(&threadmutex);
1512 	vcpu_done[n] = 1;
1513 	pthread_cond_signal(&threadcond);
1514 	mutex_unlock(&threadmutex);
1515 
1516 	return ((void *)ret);
1517 }
1518 
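/*
 * vcpu_pic_intr
 *
 * Raises or lowers the emulated PIC's INTR line for the given vcpu by issuing
 * a VMM_IOC_INTR ioctl to vmm(4).
 *
 * Parameters:
 *  vm_id: VM ID
 *  vcpu_id: VCPU ID
 *  intr: 1 to assert the INTR line, 0 to deassert it
 *
 * Return values:
 *  0: success
 *  !0: the errno from the failed ioctl
 */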
1519 int
1520 vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
1521 {
1522 	struct vm_intr_params vip;
1523 
1524 	memset(&vip, 0, sizeof(vip));
1525 
1526 	vip.vip_vm_id = vm_id;
1527 	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
1528 	vip.vip_intr = intr;
1529 
1530 	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
1531 		return (errno);
1532 
1533 	return (0);
1534 }
1535 
1536 /*
1537  * vcpu_exit_pci
1538  *
1539  * Handle all I/O to the emulated PCI subsystem.
1540  *
1541  * Parameters:
1542  *  vrp: vcpu run parameters containing guest state for this exit
1543  *
1544  * Return value:
1545  *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
1546  *      be injected.
1547  */
1548 uint8_t
1549 vcpu_exit_pci(struct vm_run_params *vrp)
1550 {
1551 	struct vm_exit *vei = vrp->vrp_exit;
1552 	uint8_t intr;
1553 
1554 	intr = 0xFF;
1555 
1556 	switch (vei->vei.vei_port) {
1557 	case PCI_MODE1_ADDRESS_REG:
1558 		pci_handle_address_reg(vrp);
1559 		break;
1560 	case PCI_MODE1_DATA_REG:
1561 	case PCI_MODE1_DATA_REG + 1:
1562 	case PCI_MODE1_DATA_REG + 2:
1563 	case PCI_MODE1_DATA_REG + 3:
1564 		pci_handle_data_reg(vrp);
1565 		break;
1566 	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
1567 		intr = pci_handle_io(vrp);
1568 		break;
1569 	default:
1570 		log_warnx("%s: unknown PCI register 0x%llx",
1571 		    __progname, (uint64_t)vei->vei.vei_port);
1572 		break;
1573 	}
1574 
1575 	return (intr);
1576 }
1577 
1578 /*
1579  * vcpu_exit_inout
1580  *
1581  * Handle all I/O exits that need to be emulated in vmd. This includes the
1582  * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
1583  *
1584  * Parameters:
1585  *  vrp: vcpu run parameters containing guest state for this exit
1586  */
1587 void
1588 vcpu_exit_inout(struct vm_run_params *vrp)
1589 {
1590 	struct vm_exit *vei = vrp->vrp_exit;
1591 	uint8_t intr = 0xFF;
1592 
1593 	if (ioports_map[vei->vei.vei_port] != NULL)
1594 		intr = ioports_map[vei->vei.vei_port](vrp);
1595 	else if (vei->vei.vei_dir == VEI_DIR_IN)
1596 		set_return_data(vei, 0xFFFFFFFF);
1597 
1598 	if (intr != 0xFF)
1599 		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
1600 }
1601 
1602 /*
1603  * vcpu_exit
1604  *
1605  * Handle a vcpu exit. This function is called when it is determined that
1606  * vmm(4) requires the assistance of vmd to support a particular guest
1607  * exit type (eg, accessing an I/O port or device). Guest state is contained
1608  * in 'vrp', and will be resent to vmm(4) on exit completion.
1609  *
1610  * Upon conclusion of handling the exit, the function determines if any
1611  * interrupts should be injected into the guest, and asserts the proper
1612  * IRQ line whose interrupt should be vectored.
1613  *
1614  * Parameters:
1615  *  vrp: vcpu run parameters containing guest state for this exit
1616  *
1617  * Return values:
1618  *  0: the exit was handled successfully
1619  *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
1620  */
1621 int
1622 vcpu_exit(struct vm_run_params *vrp)
1623 {
1624 	int ret;
1625 
1626 	switch (vrp->vrp_exit_reason) {
1627 	case VMX_EXIT_INT_WINDOW:
1628 	case SVM_VMEXIT_VINTR:
1629 	case VMX_EXIT_CPUID:
1630 	case VMX_EXIT_EXTINT:
1631 	case SVM_VMEXIT_INTR:
1632 	case VMX_EXIT_EPT_VIOLATION:
1633 	case SVM_VMEXIT_NPF:
1634 	case SVM_VMEXIT_MSR:
1635 	case SVM_VMEXIT_CPUID:
1636 		/*
1637 		 * We may be exiting to vmd to handle a pending interrupt but
1638 		 * at the same time the last exit type may have been one of
1639 		 * these. In this case, there's nothing extra to be done
1640 		 * here (and falling through to the default case below results
1641 		 * in more vmd log spam).
1642 		 */
1643 		break;
1644 	case VMX_EXIT_IO:
1645 	case SVM_VMEXIT_IOIO:
1646 		vcpu_exit_inout(vrp);
1647 		break;
1648 	case VMX_EXIT_HLT:
1649 	case SVM_VMEXIT_HLT:
1650 		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1651 		if (ret) {
1652 			log_warnx("%s: can't lock vcpu mutex (%d)",
1653 			    __func__, ret);
1654 			return (ret);
1655 		}
1656 		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
1657 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1658 		if (ret) {
1659 			log_warnx("%s: can't unlock vcpu mutex (%d)",
1660 			    __func__, ret);
1661 			return (ret);
1662 		}
1663 		break;
1664 	case VMX_EXIT_TRIPLE_FAULT:
1665 	case SVM_VMEXIT_SHUTDOWN:
1666 		/* reset VM */
1667 		return (EAGAIN);
1668 	default:
1669 		log_debug("%s: unknown exit reason 0x%x",
1670 		    __progname, vrp->vrp_exit_reason);
1671 	}
1672 
1673 	/* Process any pending traffic */
1674 	vionet_process_rx(vrp->vrp_vm_id);
1675 
1676 	vrp->vrp_continue = 1;
1677 
1678 	return (0);
1679 }
1680 
1681 /*
1682  * find_gpa_range
1683  *
1684  * Search for a contiguous guest physical mem range.
1685  *
1686  * Parameters:
1687  *  vcp: VM create parameters that contain the memory map to search in
1688  *  gpa: the starting guest physical address
1689  *  len: the length of the memory range
1690  *
1691  * Return values:
1692  *  NULL: if the guest has no contiguous memory range covering [gpa, gpa + len)
1693  *  Pointer to vm_mem_range that contains the start of the range otherwise.
1694  */
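/*
 * For example, with the ranges created by create_memory_map(), a request that
 * starts in range 1 and extends into range 2 is satisfied (those ranges are
 * contiguous), whereas a request extending past the end of the last range, or
 * across the PCI MMIO hole between ranges 2 and 3, returns NULL.
 */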
1695 static struct vm_mem_range *
1696 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
1697 {
1698 	size_t i, n;
1699 	struct vm_mem_range *vmr;
1700 
1701 	/* Find the first vm_mem_range that contains gpa */
1702 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1703 		vmr = &vcp->vcp_memranges[i];
1704 		if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
1705 			break;
1706 	}
1707 
1708 	/* No range found. */
1709 	if (i == vcp->vcp_nmemranges)
1710 		return (NULL);
1711 
1712 	/*
1713 	 * vmr may cover the range [gpa, gpa + len) only partly. Make
1714 	 * sure that the following vm_mem_ranges are contiguous and
1715 	 * cover the rest.
1716 	 */
1717 	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
1718 	if (len < n)
1719 		len = 0;
1720 	else
1721 		len -= n;
1722 	gpa = vmr->vmr_gpa + vmr->vmr_size;
1723 	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
1724 		vmr = &vcp->vcp_memranges[i];
1725 		if (gpa != vmr->vmr_gpa)
1726 			return (NULL);
1727 		if (len <= vmr->vmr_size)
1728 			len = 0;
1729 		else
1730 			len -= vmr->vmr_size;
1731 
1732 		gpa = vmr->vmr_gpa + vmr->vmr_size;
1733 	}
1734 
1735 	if (len != 0)
1736 		return (NULL);
1737 
1738 	return (vmr);
1739 }
1740 
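/*
 * vaddr_mem
 *
 * Translates a guest physical address to the corresponding host virtual
 * address, provided [gpa, gpa + len) lies entirely within a single guest
 * memory range.
 *
 * Return values:
 *  Pointer into the mmap'd range on success, or NULL if the region is not
 *  contained in one memory range.
 */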
1741 void *
1742 vaddr_mem(paddr_t gpa, size_t len)
1743 {
1744 	struct vm_create_params *vcp = &current_vm->vm_params.vmc_params;
1745 	size_t i;
1746 	struct vm_mem_range *vmr;
1747 	paddr_t gpend = gpa + len;
1748 
1749 	/* Find the first vm_mem_range that contains gpa */
1750 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1751 		vmr = &vcp->vcp_memranges[i];
1752 		if (gpa < vmr->vmr_gpa)
1753 			continue;
1754 
1755 		if (gpend >= vmr->vmr_gpa + vmr->vmr_size)
1756 			continue;
1757 
1758 		return ((char *)vmr->vmr_va + (gpa - vmr->vmr_gpa));
1759 	}
1760 
1761 	return (NULL);
1762 }
1763 
1764 /*
1765  * write_mem
1766  *
1767  * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
1768  *
1769  * Parameters:
1770  *  dst: the destination paddr_t in the guest VM
1771  *  buf: data to copy (or NULL to zero the data)
1772  *  len: number of bytes to copy
1773  *
1774  * Return values:
1775  *  0: success
1776  *  EINVAL: if the guest physical memory range [dst, dst + len) does not
1777  *      exist in the guest.
1778  */
1779 int
1780 write_mem(paddr_t dst, const void *buf, size_t len)
1781 {
1782 	const char *from = buf;
1783 	char *to;
1784 	size_t n, off;
1785 	struct vm_mem_range *vmr;
1786 
1787 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
1788 	if (vmr == NULL) {
1789 		errno = EINVAL;
1790 		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
1791 		    "len = 0x%zx", __func__, dst, len);
1792 		return (EINVAL);
1793 	}
1794 
1795 	off = dst - vmr->vmr_gpa;
1796 	while (len != 0) {
1797 		n = vmr->vmr_size - off;
1798 		if (len < n)
1799 			n = len;
1800 
1801 		to = (char *)vmr->vmr_va + off;
1802 		if (buf == NULL)
1803 			memset(to, 0, n);
1804 		else {
1805 			memcpy(to, from, n);
1806 			from += n;
1807 		}
1808 		len -= n;
1809 		off = 0;
1810 		vmr++;
1811 	}
1812 
1813 	return (0);
1814 }
1815 
1816 /*
1817  * read_mem
1818  *
1819  * Reads memory at guest paddr 'src' into 'buf'.
1820  *
1821  * Parameters:
1822  *  src: the source paddr_t in the guest VM to read from.
1823  *  buf: destination (local) buffer
1824  *  len: number of bytes to read
1825  *
1826  * Return values:
1827  *  0: success
1828  *  EINVAL: if the guest physical memory range [src, src + len) does not
1829  *      exist in the guest.
1830  */
1831 int
1832 read_mem(paddr_t src, void *buf, size_t len)
1833 {
1834 	char *from, *to = buf;
1835 	size_t n, off;
1836 	struct vm_mem_range *vmr;
1837 
1838 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
1839 	if (vmr == NULL) {
1840 		errno = EINVAL;
1841 		log_warn("%s: failed - invalid memory range src = 0x%lx, "
1842 		    "len = 0x%zx", __func__, src, len);
1843 		return (EINVAL);
1844 	}
1845 
1846 	off = src - vmr->vmr_gpa;
1847 	while (len != 0) {
1848 		n = vmr->vmr_size - off;
1849 		if (len < n)
1850 			n = len;
1851 
1852 		from = (char *)vmr->vmr_va + off;
1853 		memcpy(to, from, n);
1854 
1855 		to += n;
1856 		len -= n;
1857 		off = 0;
1858 		vmr++;
1859 	}
1860 
1861 	return (0);
1862 }
1863 
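/*
 * iovec_mem
 *
 * Fills 'iov' with pointers into guest memory covering the guest physical
 * range [src, src + len), one iovec per memory range touched.
 *
 * Return values:
 *  The number of iovec entries used on success.
 *  -1 with errno set to EINVAL if the range does not exist in the guest, or
 *     to ENOMEM if more than 'iovcnt' entries would be needed.
 */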
1864 int
1865 iovec_mem(paddr_t src, size_t len, struct iovec *iov, int iovcnt)
1866 {
1867 	size_t n, off;
1868 	struct vm_mem_range *vmr;
1869 	int niov = 0;
1870 
1871 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
1872 	if (vmr == NULL) {
1873 		errno = EINVAL;
1874 		return (-1);
1875 	}
1876 
1877 	off = src - vmr->vmr_gpa;
1878 	while (len > 0) {
1879 		if (niov == iovcnt) {
1880 			errno = ENOMEM;
1881 			return (-1);
1882 		}
1883 
1884 		n = vmr->vmr_size - off;
1885 		if (len < n)
1886 			n = len;
1887 
1888 		iov[niov].iov_base = (char *)vmr->vmr_va + off;
1889 		iov[niov].iov_len = n;
1890 
1891 		niov++;
1892 
1893 		len -= n;
1894 		off = 0;
1895 		vmr++;
1896 	}
1897 
1898 	return (niov);
1899 }
1900 
1901 /*
1902  * vcpu_assert_pic_irq
1903  *
1904  * Injects the specified IRQ on the supplied vcpu/vm
1905  *
1906  * Parameters:
1907  *  vm_id: VM ID to inject to
1908  *  vcpu_id: VCPU ID to inject to
1909  *  irq: IRQ to inject
1910  */
1911 void
1912 vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1913 {
1914 	int ret;
1915 
1916 	i8259_assert_irq(irq);
1917 
1918 	if (i8259_is_pending()) {
1919 		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
1920 			fatalx("%s: can't assert INTR", __func__);
1921 
1922 		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
1923 		if (ret)
1924 			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);
1925 
1926 		vcpu_hlt[vcpu_id] = 0;
1927 		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
1928 		if (ret)
1929 			fatalx("%s: can't signal (%d)", __func__, ret);
1930 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
1931 		if (ret)
1932 			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
1933 	}
1934 }
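
/*
 * Example (illustrative sketch): a device model that has data ready for the
 * guest raises its interrupt line and later lowers it once the condition is
 * cleared.  'vm_id', 'vcpu_id' and 'dev_irq' are hypothetical values owned
 * by the device model.
 *
 *	vcpu_assert_pic_irq(vm_id, vcpu_id, dev_irq);
 *	... guest services the device ...
 *	vcpu_deassert_pic_irq(vm_id, vcpu_id, dev_irq);
 */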
1935 
1936 /*
1937  * vcpu_deassert_pic_irq
1938  *
1939  * Clears the specified IRQ on the supplied vcpu/vm
1940  *
1941  * Parameters:
1942  *  vm_id: VM ID to clear in
1943  *  vcpu_id: VCPU ID to clear in
1944  *  irq: IRQ to clear
1945  */
1946 void
1947 vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1948 {
1949 	i8259_deassert_irq(irq);
1950 
1951 	if (!i8259_is_pending()) {
1952 		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
1953 			fatalx("%s: can't deassert INTR for vm_id %d, "
1954 			    "vcpu_id %d", __func__, vm_id, vcpu_id);
1955 	}
1956 }
1957 
1958 /*
1959  * fd_hasdata
1960  *
1961  * Determines if data can be read from a file descriptor.
1962  *
1963  * Parameters:
1964  *  fd: the fd to check
1965  *
1966  * Return values:
1967  *  1 if data can be read from an fd, or 0 otherwise.
1968  */
1969 int
1970 fd_hasdata(int fd)
1971 {
1972 	struct pollfd pfd[1];
1973 	int nready, hasdata = 0;
1974 
1975 	pfd[0].fd = fd;
1976 	pfd[0].events = POLLIN;
1977 	nready = poll(pfd, 1, 0);
1978 	if (nready == -1)
1979 		log_warn("checking file descriptor for data failed");
1980 	else if (nready == 1 && pfd[0].revents & POLLIN)
1981 		hasdata = 1;
1982 	return (hasdata);
1983 }
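
/*
 * Example (illustrative sketch): a serial device model can use fd_hasdata()
 * to decide whether to raise its receive interrupt.  'con_fd', 'vm_id' and
 * 'com_irq' are hypothetical values owned by the device model.
 *
 *	if (fd_hasdata(con_fd))
 *		vcpu_assert_pic_irq(vm_id, 0, com_irq);
 */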
1984 
1985 /*
1986  * mutex_lock
1987  *
1988  * Wrapper function for pthread_mutex_lock that does error checking and that
1989  * exits on failure
1990  */
1991 void
1992 mutex_lock(pthread_mutex_t *m)
1993 {
1994 	int ret;
1995 
1996 	ret = pthread_mutex_lock(m);
1997 	if (ret) {
1998 		errno = ret;
1999 		fatal("could not acquire mutex");
2000 	}
2001 }
2002 
2003 /*
2004  * mutex_unlock
2005  *
2006  * Wrapper function for pthread_mutex_unlock that does error checking and that
2007  * exits on failure
2008  */
2009 void
2010 mutex_unlock(pthread_mutex_t *m)
2011 {
2012 	int ret;
2013 
2014 	ret = pthread_mutex_unlock(m);
2015 	if (ret) {
2016 		errno = ret;
2017 		fatal("could not release mutex");
2018 	}
2019 }
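
/*
 * Example (illustrative sketch): protecting a shared counter with the
 * wrappers above.  'stats_mtx' and 'io_exit_count' are hypothetical.  Since
 * both wrappers call fatal() on failure, callers do not check a return
 * value.
 *
 *	mutex_lock(&stats_mtx);
 *	io_exit_count++;
 *	mutex_unlock(&stats_mtx);
 */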
2020 
2021 /*
2022  * set_return_data
2023  *
2024  * Utility function for manipulating register data in vm exit info structs. This
2025  * function ensures that the data is copied to the vei->vei.vei_data field with
2026  * the proper size for the operation being performed.
2027  *
2028  * Parameters:
2029  *  vei: exit information
2030  *  data: return data
2031  */
2032 void
2033 set_return_data(struct vm_exit *vei, uint32_t data)
2034 {
2035 	switch (vei->vei.vei_size) {
2036 	case 1:
2037 		vei->vei.vei_data &= ~0xFF;
2038 		vei->vei.vei_data |= (uint8_t)data;
2039 		break;
2040 	case 2:
2041 		vei->vei.vei_data &= ~0xFFFF;
2042 		vei->vei.vei_data |= (uint16_t)data;
2043 		break;
2044 	case 4:
2045 		vei->vei.vei_data = data;
2046 		break;
2047 	}
2048 }
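
/*
 * Example (illustrative sketch): an i/o port handler completing an "in"
 * instruction returns a device register value; set_return_data() masks it
 * to the access size so the untouched bytes of the guest register are
 * preserved.  'vei' is the struct vm_exit for the current exit and
 * 'dev_status' is a hypothetical device register value.
 *
 *	set_return_data(vei, dev_status);
 */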
2049 
2050 /*
2051  * get_input_data
2052  *
2053  * Utility function for manipulating register data in vm exit info
2054  * structs. This function ensures that the data is copied from the
2055  * vei->vei.vei_data field with the proper size for the operation being
2056  * performed.
2057  *
2058  * Parameters:
2059  *  vei: exit information
2060  *  data: location to store the result
2061  */
2062 void
2063 get_input_data(struct vm_exit *vei, uint32_t *data)
2064 {
2065 	switch (vei->vei.vei_size) {
2066 	case 1:
2067 		*data &= 0xFFFFFF00;
2068 		*data |= (uint8_t)vei->vei.vei_data;
2069 		break;
2070 	case 2:
2071 		*data &= 0xFFFF0000;
2072 		*data |= (uint16_t)vei->vei.vei_data;
2073 		break;
2074 	case 4:
2075 		*data = vei->vei.vei_data;
2076 		break;
2077 	default:
2078 		log_warnx("%s: invalid i/o size %d", __func__,
2079 		    vei->vei.vei_size);
2080 	}
2081 
2082 }
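
/*
 * Example (illustrative sketch): an i/o port handler for an "out"
 * instruction retrieves the value the guest wrote, masked to the access
 * size.  'outval' is initialized to zero so the unused upper bytes stay
 * zero; 'dev_write_reg' and 'port' are hypothetical.
 *
 *	uint32_t outval = 0;
 *
 *	get_input_data(vei, &outval);
 *	dev_write_reg(port, outval);
 */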
2083 
2084 /*
2085  * translate_gva
2086  *
2087  * Translates a guest virtual address to a guest physical address by walking
2088  * the currently active page table (if needed).
2089  *
2090  * Note - this function can possibly alter the supplied VCPU state.
2091  *  Specifically, it may inject exceptions depending on the current VCPU
2092  *  configuration, and may alter %cr2 on #PF. Consequently, this function
2093  *  should only be used as part of instruction emulation.
2094  *
2095  * Parameters:
2096  *  exit: the vm_exit whose saved VCPU register state (%cr0, %cr3, %cr4, EFER)
2097  *   supplies the guest MMU settings used for the translation
2098  *  va: virtual address to translate
2099  *  pa: pointer to paddr_t variable that will receive the translated physical
2100  *   address. 'pa' is unchanged on error.
2101  *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
2102  *   the address should be translated
2103  *
2104  * Return values:
2105  *  0: the address was successfully translated - 'pa' contains the physical
2106  *     address currently mapped by 'va'.
2107  *  EFAULT: the PTE for 'va' is unmapped. A #PF will be injected in this case
2108  *     and %cr2 set in the vcpu structure.
2109  *  EINVAL: 'pa' is NULL, or paging is enabled while the guest is not in
2110  *      protected mode.
2110  */
2111 int
2112 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
2113 {
2114 	int level, shift, pdidx;
2115 	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
2116 	uint64_t shift_width, pte_size;
2117 	struct vcpu_reg_state *vrs;
2118 
2119 	vrs = &exit->vrs;
2120 
2121 	if (!pa)
2122 		return (EINVAL);
2123 
2124 	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
2125 		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
2126 		*pa = va;
2127 		return (0);
2128 	}
2129 
2130 	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
2131 
2132 	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
2133 	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
2134 
2135 	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
2136 		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
2137 			pte_size = sizeof(uint64_t);
2138 			shift_width = 9;
2139 
2140 			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
2141 				/* 4 level paging */
2142 				level = 4;
2143 				mask = L4_MASK;
2144 				shift = L4_SHIFT;
2145 			} else {
2146 				/* 32 bit with PAE paging */
2147 				level = 3;
2148 				mask = L3_MASK;
2149 				shift = L3_SHIFT;
2150 			}
2151 		} else {
2152 			/* 32 bit paging */
2153 			level = 2;
2154 			shift_width = 10;
2155 			mask = 0xFFC00000;
2156 			shift = 22;
2157 			pte_size = sizeof(uint32_t);
2158 		}
2159 	} else
2160 		return (EINVAL);
2161 
2162 	/* XXX: Check for R bit in segment selector and set A bit */
2163 
2164 	for (; level > 0; level--) {
2165 		pdidx = (va & mask) >> shift;
2166 		pte_paddr = (pt_paddr) + (pdidx * pte_size);
2167 
2168 		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
2169 		    level, pte_paddr);
2170 		if (read_mem(pte_paddr, &pte, pte_size)) {
2171 			log_warn("%s: failed to read pte", __func__);
2172 			return (EFAULT);
2173 		}
2174 
2175 		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
2176 		    pte);
2177 
2178 		/* XXX: Set CR2 */
2179 		if (!(pte & PG_V))
2180 			return (EFAULT);
2181 
2182 		/* XXX: Check for SMAP */
2183 		if ((mode == PROT_WRITE) && !(pte & PG_RW))
2184 			return (EPERM);
2185 
2186 		if ((exit->cpl > 0) && !(pte & PG_u))
2187 			return (EPERM);
2188 
2189 		pte = pte | PG_U;
2190 		if (mode == PROT_WRITE)
2191 			pte = pte | PG_M;
2192 		if (write_mem(pte_paddr, &pte, pte_size)) {
2193 			log_warn("%s: failed to write back flags to pte",
2194 			    __func__);
2195 			return (EIO);
2196 		}
2197 
2198 		/* XXX: EINVAL if in 32-bit mode and PG_PS is 1 but CR4.PSE is 0 */
2199 		if (pte & PG_PS)
2200 			break;
2201 
2202 		if (level > 1) {
2203 			pt_paddr = pte & PG_FRAME;
2204 			shift -= shift_width;
2205 			mask = mask >> shift_width;
2206 		}
2207 	}
2208 
2209 	low_mask = (1 << shift) - 1;
2210 	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
2211 	*pa = (pte & high_mask) | (va & low_mask);
2212 
2213 	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx", __func__, va, *pa);
2214 
2215 	return (0);
2216 }
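
/*
 * Example (illustrative sketch): during instruction emulation, a handler can
 * translate a guest virtual address before fetching the byte it refers to.
 * 'vei' is the struct vm_exit for the current exit and 'gva' is a
 * hypothetical guest virtual address.
 *
 *	uint64_t gpa;
 *	uint8_t byte;
 *
 *	if (translate_gva(vei, gva, &gpa, PROT_READ) == 0 &&
 *	    read_mem(gpa, &byte, 1) == 0)
 *		... 'byte' now holds the byte mapped at 'gva' ...
 */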
2217