1 /*	$OpenBSD: vm.c,v 1.58 2020/06/28 16:52:45 pd Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/ioctl.h>
21 #include <sys/queue.h>
22 #include <sys/wait.h>
23 #include <sys/uio.h>
24 #include <sys/socket.h>
25 #include <sys/time.h>
26 #include <sys/mman.h>
27 
28 #include <dev/ic/i8253reg.h>
29 #include <dev/isa/isareg.h>
30 #include <dev/pci/pcireg.h>
31 
32 #include <machine/param.h>
33 #include <machine/psl.h>
34 #include <machine/pte.h>
35 #include <machine/specialreg.h>
36 #include <machine/vmmvar.h>
37 
38 #include <net/if.h>
39 
40 #include <errno.h>
41 #include <event.h>
42 #include <fcntl.h>
43 #include <imsg.h>
44 #include <limits.h>
45 #include <poll.h>
46 #include <pthread.h>
47 #include <stddef.h>
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include <unistd.h>
52 #include <util.h>
53 
54 #include "vmd.h"
55 #include "vmm.h"
56 #include "loadfile.h"
57 #include "pci.h"
58 #include "virtio.h"
59 #include "proc.h"
60 #include "i8253.h"
61 #include "i8259.h"
62 #include "ns8250.h"
63 #include "mc146818.h"
64 #include "fw_cfg.h"
65 #include "atomicio.h"
66 
67 io_fn_t ioports_map[MAX_PORTS];
68 
69 int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
70     struct vmop_create_params *, struct vcpu_reg_state *);
71 void vm_dispatch_vmm(int, short, void *);
72 void *event_thread(void *);
73 void *vcpu_run_loop(void *);
74 int vcpu_exit(struct vm_run_params *);
75 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
76 void create_memory_map(struct vm_create_params *);
77 int alloc_guest_mem(struct vm_create_params *);
78 int vmm_create_vm(struct vm_create_params *);
79 void init_emulated_hw(struct vmop_create_params *, int,
80     int[][VM_MAX_BASE_PER_DISK], int *);
81 void restore_emulated_hw(struct vm_create_params *, int, int *,
82     int[][VM_MAX_BASE_PER_DISK],int);
83 void vcpu_exit_inout(struct vm_run_params *);
84 int vcpu_exit_eptviolation(struct vm_run_params *);
85 uint8_t vcpu_exit_pci(struct vm_run_params *);
86 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
87 int loadfile_bios(FILE *, struct vcpu_reg_state *);
88 int send_vm(int, struct vm_create_params *);
89 int dump_send_header(int);
90 int dump_vmr(int , struct vm_mem_range *);
91 int dump_mem(int, struct vm_create_params *);
92 void restore_vmr(int, struct vm_mem_range *);
93 void restore_mem(int, struct vm_create_params *);
94 int restore_vm_params(int, struct vm_create_params *);
95 void pause_vm(struct vm_create_params *);
96 void unpause_vm(struct vm_create_params *);
97 
98 int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int);
99 
100 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
101     size_t);
102 
103 int con_fd;
104 struct vmd_vm *current_vm;
105 
106 extern struct vmd *env;
107 
108 extern char *__progname;
109 
110 pthread_mutex_t threadmutex;
111 pthread_cond_t threadcond;
112 
113 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
114 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
115 pthread_barrier_t vm_pause_barrier;
116 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
117 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
118 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
119 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
120 
121 /*
122  * Represents a standard register set for an OS to be booted
123  * as a flat 64 bit address space.
124  *
125  * NOT set here are:
126  *  RIP
127  *  RSP
128  *  GDTR BASE
129  *
130  * Specific bootloaders should clone this structure and override
131  * those fields as needed.
132  *
133  * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
134  *        features of the CPU in use.
135  */
136 static const struct vcpu_reg_state vcpu_init_flat64 = {
137 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
138 	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
139 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
140 	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
141 	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
142 	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
143 	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
144 	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
145 	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
146 	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
147 	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
148 	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
149 	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
150 	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
151 	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
152 	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
153 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
154 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
155 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
156 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
157 	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
158 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
159 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
160 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
161 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
162 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
163 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
164 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
165 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
166 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
167 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
168 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
169 	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
170 	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
171 };
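
/*
 * Illustrative sketch only (not part of any code path in this file):
 * a loader starting from vcpu_init_flat64 copies it and fills in the
 * fields listed above before handing the state to vcpu_reset().  The
 * values below are hypothetical placeholders.
 *
 *	struct vcpu_reg_state vrs;
 *
 *	memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
 *	vrs.vrs_gprs[VCPU_REGS_RIP] = 0x1001000;
 *	vrs.vrs_gprs[VCPU_REGS_RSP] = 0x7f000;
 *	(plus the GDTR base, per the note above)
 */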
172 
173 /*
174  * Represents a standard register set for a BIOS to be booted
175  * as a flat 16 bit address space.
176  */
177 static const struct vcpu_reg_state vcpu_init_flat16 = {
178 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
179 	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
180 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
181 	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
182 	.vrs_crs[VCPU_REGS_CR3] = 0,
183 	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
184 	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
185 	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
186 	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
187 	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
188 	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
189 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
190 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
191 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
192 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
193 	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
194 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
195 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
196 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
197 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
198 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
199 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
200 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
201 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
202 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
203 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
204 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
205 	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
206 };
207 
208 /*
209  * loadfile_bios
210  *
211  * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
212  * directly into memory.
213  *
214  * Parameters:
215  *  fp: FILE pointer to the BIOS image to load
216  *  (out) vrs: register state to set on init for this kernel
217  *
218  * Return values:
219  *  0 if successful
220  *  -1 if the image could not be read or does not end at the 1MB boundary
221  */
222 int
223 loadfile_bios(FILE *fp, struct vcpu_reg_state *vrs)
224 {
225 	off_t	 size, off;
226 
227 	/* Set up a "flat 16 bit" register state for BIOS */
228 	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
229 
230 	/* Get the size of the BIOS image and seek to the beginning */
231 	if (fseeko(fp, 0, SEEK_END) == -1 || (size = ftello(fp)) == -1 ||
232 	    fseeko(fp, 0, SEEK_SET) == -1)
233 		return (-1);
234 
235 	/* The BIOS image must end at 1M */
236 	if ((off = 1048576 - size) < 0)
237 		return (-1);
238 
239 	/* Read BIOS image into memory */
240 	if (mread(fp, off, size) != (size_t)size) {
241 		errno = EIO;
242 		return (-1);
243 	}
244 
245 	log_debug("%s: loaded BIOS image", __func__);
246 
247 	return (0);
248 }
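
/*
 * Worked example: a hypothetical 64KB (65536 byte) BIOS image yields
 * off = 1048576 - 65536 = 983040 (0xF0000), so the image is read into
 * guest memory at 0xF0000-0xFFFFF.  That matches the reset state in
 * vcpu_init_flat16 above (CS base 0xF0000, RIP 0xFFF0).
 */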
249 
250 /*
251  * start_vm
252  *
253  * After forking a new VM process, starts the new VM with the creation
254  * parameters supplied (in the incoming vm->vm_params field). This
255  * function performs a basic sanity check on the incoming parameters
256  * and then performs the following steps to complete the creation of the VM:
257  *
258  * 1. validates and creates the new VM
259  * 2. opens the imsg control channel to the parent and drops more privilege
260  * 3. drops additional privileges by calling pledge(2)
261  * 4. loads the kernel from the disk image or file descriptor
262  * 5. runs the VM's VCPU loops.
263  *
264  * Parameters:
265  *  vm: The VM data structure containing the VM create parameters.
266  *  fd: The imsg socket that is connected to the parent process.
267  *
268  * Return values:
269  *  0: success
270  *  !0 : failure - typically an errno indicating the source of the failure
271  */
272 int
273 start_vm(struct vmd_vm *vm, int fd)
274 {
275 	struct vmop_create_params *vmc = &vm->vm_params;
276 	struct vm_create_params	*vcp = &vmc->vmc_params;
277 	struct vcpu_reg_state	 vrs;
278 	int			 nicfds[VMM_MAX_NICS_PER_VM];
279 	int			 ret;
280 	FILE			*fp;
281 	struct vmboot_params	 vmboot;
282 	size_t			 i;
283 	struct vm_rwregs_params  vrp;
284 
285 	/* Child */
286 	setproctitle("%s", vcp->vcp_name);
287 	log_procinit(vcp->vcp_name);
288 
289 	if (!(vm->vm_state & VM_STATE_RECEIVED))
290 		create_memory_map(vcp);
291 
292 	ret = alloc_guest_mem(vcp);
293 
294 	if (ret) {
295 		errno = ret;
296 		fatal("could not allocate guest memory - exiting");
297 	}
298 
299 	ret = vmm_create_vm(vcp);
300 	current_vm = vm;
301 
302 	/* send back the kernel-generated vm id (0 on error) */
303 	if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
304 	    sizeof(vcp->vcp_id))
305 		fatal("write vcp id");
306 
307 	if (ret) {
308 		errno = ret;
309 		fatal("create vmm ioctl failed - exiting");
310 	}
311 
312 	/*
313 	 * pledge in the vm processes:
314 	 * stdio - for malloc and basic I/O including events.
315 	 * recvfd - for send/recv.
316 	 * vmm - for the vmm ioctls and operations.
317 	 */
318 	if (pledge("stdio vmm recvfd", NULL) == -1)
319 		fatal("pledge");
320 
321 	if (vm->vm_state & VM_STATE_RECEIVED) {
322 		ret = read(vm->vm_receive_fd, &vrp, sizeof(vrp));
323 		if (ret != sizeof(vrp)) {
324 			fatal("received incomplete vrp - exiting");
325 		}
326 		vrs = vrp.vrwp_regs;
327 	} else {
328 		/*
329 		 * Set up default "flat 64 bit" register state - RIP,
330 		 * RSP, and GDT info will be set in bootloader
331 		 */
332 		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
333 
334 		/* Find and open kernel image */
335 		if ((fp = vmboot_open(vm->vm_kernel,
336 		    vm->vm_disks[0], vmc->vmc_diskbases[0],
337 		    vmc->vmc_disktypes[0], &vmboot)) == NULL)
338 			fatalx("failed to open kernel - exiting");
339 
340 		/* Load kernel image */
341 		ret = loadfile_elf(fp, vcp, &vrs,
342 		    vmboot.vbp_bootdev, vmboot.vbp_howto, vmc->vmc_bootdevice);
343 
344 		/*
345 		 * Try BIOS as a fallback (only if it was provided as an image
346 		 * with vm->vm_kernel and not loaded from the disk)
347 		 */
348 		if (ret && errno == ENOEXEC && vm->vm_kernel != -1)
349 			ret = loadfile_bios(fp, &vrs);
350 
351 		if (ret)
352 			fatal("failed to load kernel or BIOS - exiting");
353 
354 		vmboot_close(fp, &vmboot);
355 	}
356 
357 	if (vm->vm_kernel != -1)
358 		close(vm->vm_kernel);
359 
360 	con_fd = vm->vm_tty;
361 	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
362 		fatal("failed to set nonblocking mode on console");
363 
364 	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
365 		nicfds[i] = vm->vm_ifs[i].vif_fd;
366 
367 	event_init();
368 
369 	if (vm->vm_state & VM_STATE_RECEIVED) {
370 		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
371 		    vm->vm_disks, vm->vm_cdrom);
372 		restore_mem(vm->vm_receive_fd, vcp);
373 		if (restore_vm_params(vm->vm_receive_fd, vcp))
374 			fatal("restore vm params failed");
375 		unpause_vm(vcp);
376 	}
377 
378 	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
379 		fatal("setup vm pipe");
380 
381 	/* Execute the vcpu run loop(s) for this VM */
382 	ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);
383 
384 	/* Ensure that any in-flight data is written back */
385 	virtio_shutdown(vm);
386 
387 	return (ret);
388 }
389 
390 /*
391  * vm_dispatch_vmm
392  *
393  * imsg callback for messages that are received from the vmm parent process.
394  */
395 void
396 vm_dispatch_vmm(int fd, short event, void *arg)
397 {
398 	struct vmd_vm		*vm = arg;
399 	struct vmop_result	 vmr;
400 	struct imsgev		*iev = &vm->vm_iev;
401 	struct imsgbuf		*ibuf = &iev->ibuf;
402 	struct imsg		 imsg;
403 	ssize_t			 n;
404 	int			 verbose;
405 
406 	if (event & EV_READ) {
407 		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
408 			fatal("%s: imsg_read", __func__);
409 		if (n == 0)
410 			_exit(0);
411 	}
412 
413 	if (event & EV_WRITE) {
414 		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
415 			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
416 		if (n == 0)
417 			_exit(0);
418 	}
419 
420 	for (;;) {
421 		if ((n = imsg_get(ibuf, &imsg)) == -1)
422 			fatal("%s: imsg_get", __func__);
423 		if (n == 0)
424 			break;
425 
426 #if DEBUG > 1
427 		log_debug("%s: got imsg %d from %s",
428 		    __func__, imsg.hdr.type,
429 		    vm->vm_params.vmc_params.vcp_name);
430 #endif
431 
432 		switch (imsg.hdr.type) {
433 		case IMSG_CTL_VERBOSE:
434 			IMSG_SIZE_CHECK(&imsg, &verbose);
435 			memcpy(&verbose, imsg.data, sizeof(verbose));
436 			log_setverbose(verbose);
437 			break;
438 		case IMSG_VMDOP_VM_SHUTDOWN:
439 			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
440 				_exit(0);
441 			break;
442 		case IMSG_VMDOP_VM_REBOOT:
443 			if (vmmci_ctl(VMMCI_REBOOT) == -1)
444 				_exit(0);
445 			break;
446 		case IMSG_VMDOP_PAUSE_VM:
447 			vmr.vmr_result = 0;
448 			vmr.vmr_id = vm->vm_vmid;
449 			pause_vm(&vm->vm_params.vmc_params);
450 			imsg_compose_event(&vm->vm_iev,
451 			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
452 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
453 			    sizeof(vmr));
454 			break;
455 		case IMSG_VMDOP_UNPAUSE_VM:
456 			vmr.vmr_result = 0;
457 			vmr.vmr_id = vm->vm_vmid;
458 			unpause_vm(&vm->vm_params.vmc_params);
459 			imsg_compose_event(&vm->vm_iev,
460 			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
461 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
462 			    sizeof(vmr));
463 			break;
464 		case IMSG_VMDOP_SEND_VM_REQUEST:
465 			vmr.vmr_id = vm->vm_vmid;
466 			vmr.vmr_result = send_vm(imsg.fd,
467 			    &vm->vm_params.vmc_params);
468 			imsg_compose_event(&vm->vm_iev,
469 			    IMSG_VMDOP_SEND_VM_RESPONSE,
470 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
471 			    sizeof(vmr));
472 			if (!vmr.vmr_result) {
473 				imsg_flush(&current_vm->vm_iev.ibuf);
474 				_exit(0);
475 			}
476 			break;
477 		default:
478 			fatalx("%s: got invalid imsg %d from %s",
479 			    __func__, imsg.hdr.type,
480 			    vm->vm_params.vmc_params.vcp_name);
481 		}
482 		imsg_free(&imsg);
483 	}
484 	imsg_event_add(iev);
485 }
486 
487 /*
488  * vm_shutdown
489  *
490  * Tell the vmm parent process to shut down or reboot the VM, then exit.
491  */
492 __dead void
493 vm_shutdown(unsigned int cmd)
494 {
495 	switch (cmd) {
496 	case VMMCI_NONE:
497 	case VMMCI_SHUTDOWN:
498 		(void)imsg_compose_event(&current_vm->vm_iev,
499 		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
500 		break;
501 	case VMMCI_REBOOT:
502 		(void)imsg_compose_event(&current_vm->vm_iev,
503 		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
504 		break;
505 	default:
506 		fatalx("invalid vm ctl command: %d", cmd);
507 	}
508 	imsg_flush(&current_vm->vm_iev.ibuf);
509 
510 	_exit(0);
511 }
512 
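/*
 * send_vm
 *
 * Serializes the paused VM to fd.  The stream written below consists,
 * in order, of: the vm_dump_header, the vmop_create_params, per-VCPU
 * register state, the i8253, i8259, ns8250, mc146818, fw_cfg, PCI and
 * virtio device state, guest memory, and finally the per-VCPU vm
 * params.
 *
 * Return values:
 *  0: success
 *  !0: failure
 */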
513 int
514 send_vm(int fd, struct vm_create_params *vcp)
515 {
516 	struct vm_rwregs_params	   vrp;
517 	struct vm_rwvmparams_params vpp;
518 	struct vmop_create_params *vmc;
519 	struct vm_terminate_params vtp;
520 	unsigned int		   flags = 0;
521 	unsigned int		   i;
522 	int			   ret = 0;
523 	size_t			   sz;
524 
525 	if (dump_send_header(fd)) {
526 		log_info("%s: failed to send vm dump header", __func__);
527 		goto err;
528 	}
529 
530 	pause_vm(vcp);
531 
532 	vmc = calloc(1, sizeof(struct vmop_create_params));
533 	if (vmc == NULL) {
534 		log_warn("%s: calloc error getting vmc", __func__);
535 		ret = -1;
536 		goto err;
537 	}
538 
539 	flags |= VMOP_CREATE_MEMORY;
540 	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
541 	    vmop_create_params));
542 	vmc->vmc_flags = flags;
543 	vrp.vrwp_vm_id = vcp->vcp_id;
544 	vrp.vrwp_mask = VM_RWREGS_ALL;
545 	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
546 	vpp.vpp_vm_id = vcp->vcp_id;
547 
548 	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
549 	if (sz != sizeof(struct vmop_create_params)) {
550 		ret = -1;
551 		goto err;
552 	}
553 
554 	for (i = 0; i < vcp->vcp_ncpus; i++) {
555 		vrp.vrwp_vcpu_id = i;
556 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
557 			log_warn("%s: readregs failed", __func__);
558 			goto err;
559 		}
560 
561 		sz = atomicio(vwrite, fd, &vrp,
562 		    sizeof(struct vm_rwregs_params));
563 		if (sz != sizeof(struct vm_rwregs_params)) {
564 			log_warn("%s: dumping registers failed", __func__);
565 			ret = -1;
566 			goto err;
567 		}
568 	}
569 
570 	if ((ret = i8253_dump(fd)))
571 		goto err;
572 	if ((ret = i8259_dump(fd)))
573 		goto err;
574 	if ((ret = ns8250_dump(fd)))
575 		goto err;
576 	if ((ret = mc146818_dump(fd)))
577 		goto err;
578 	if ((ret = fw_cfg_dump(fd)))
579 		goto err;
580 	if ((ret = pci_dump(fd)))
581 		goto err;
582 	if ((ret = virtio_dump(fd)))
583 		goto err;
584 	if ((ret = dump_mem(fd, vcp)))
585 		goto err;
586 
587 	for (i = 0; i < vcp->vcp_ncpus; i++) {
588 		vpp.vpp_vcpu_id = i;
589 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
590 			log_warn("%s: readvmparams failed", __func__);
591 			goto err;
592 		}
593 
594 		sz = atomicio(vwrite, fd, &vpp,
595 		    sizeof(struct vm_rwvmparams_params));
596 		if (sz != sizeof(struct vm_rwvmparams_params)) {
597 			log_warn("%s: dumping vm params failed", __func__);
598 			ret = -1;
599 			goto err;
600 		}
601 	}
602 
603 	vtp.vtp_vm_id = vcp->vcp_id;
604 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
605 		log_warnx("%s: term IOC error: %d, %d", __func__,
606 		    errno, ENOENT);
607 	}
608 err:
609 	close(fd);
610 	if (ret)
611 		unpause_vm(vcp);
612 	return ret;
613 }
614 
615 int
616 dump_send_header(int fd) {
617 	struct vm_dump_header	   vmh;
618 	int			   i;
619 
620 	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
621 	    sizeof(vmh.vmh_signature));
622 
623 	vmh.vmh_cpuids[0].code = 0x00;
624 	vmh.vmh_cpuids[0].leaf = 0x00;
625 
626 	vmh.vmh_cpuids[1].code = 0x01;
627 	vmh.vmh_cpuids[1].leaf = 0x00;
628 
629 	vmh.vmh_cpuids[2].code = 0x07;
630 	vmh.vmh_cpuids[2].leaf = 0x00;
631 
632 	vmh.vmh_cpuids[3].code = 0x0d;
633 	vmh.vmh_cpuids[3].leaf = 0x00;
634 
635 	vmh.vmh_cpuids[4].code = 0x80000001;
636 	vmh.vmh_cpuids[4].leaf = 0x00;
637 
638 	vmh.vmh_version = VM_DUMP_VERSION;
639 
640 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
641 		CPUID_LEAF(vmh.vmh_cpuids[i].code,
642 		    vmh.vmh_cpuids[i].leaf,
643 		    vmh.vmh_cpuids[i].a,
644 		    vmh.vmh_cpuids[i].b,
645 		    vmh.vmh_cpuids[i].c,
646 		    vmh.vmh_cpuids[i].d);
647 	}
648 
649 	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
650 		return (-1);
651 
652 	return (0);
653 }
654 
655 int
656 dump_mem(int fd, struct vm_create_params *vcp)
657 {
658 	unsigned int	i;
659 	int		ret;
660 	struct		vm_mem_range *vmr;
661 
662 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
663 		vmr = &vcp->vcp_memranges[i];
664 		ret = dump_vmr(fd, vmr);
665 		if (ret)
666 			return ret;
667 	}
668 	return (0);
669 }
670 
671 int
672 restore_vm_params(int fd, struct vm_create_params *vcp) {
673 	unsigned int			i;
674 	struct vm_rwvmparams_params    vpp;
675 
676 	for (i = 0; i < vcp->vcp_ncpus; i++) {
677 		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
678 			log_warn("%s: error restoring vm params", __func__);
679 			return (-1);
680 		}
681 		vpp.vpp_vm_id = vcp->vcp_id;
682 		vpp.vpp_vcpu_id = i;
683 		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
684 			log_debug("%s: writing vm params failed", __func__);
685 			return (-1);
686 		}
687 	}
688 	return (0);
689 }
690 
691 void
692 restore_mem(int fd, struct vm_create_params *vcp)
693 {
694 	unsigned int	     i;
695 	struct vm_mem_range *vmr;
696 
697 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
698 		vmr = &vcp->vcp_memranges[i];
699 		restore_vmr(fd, vmr);
700 	}
701 }
702 
703 int
704 dump_vmr(int fd, struct vm_mem_range *vmr)
705 {
706 	size_t	rem = vmr->vmr_size, read = 0;
707 	char	buf[PAGE_SIZE];
708 
709 	while (rem > 0) {
710 		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
711 			log_warn("failed to read vmr");
712 			return (-1);
713 		}
714 		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
715 			log_warn("failed to dump vmr");
716 			return (-1);
717 		}
718 		rem = rem - PAGE_SIZE;
719 		read = read + PAGE_SIZE;
720 	}
721 	return (0);
722 }
723 
724 void
725 restore_vmr(int fd, struct vm_mem_range *vmr)
726 {
727 	size_t	rem = vmr->vmr_size, wrote = 0;
728 	char	buf[PAGE_SIZE];
729 
730 	while (rem > 0) {
731 		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
732 			fatal("failed to restore vmr");
733 		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
734 			fatal("failed to write vmr");
735 		rem = rem - PAGE_SIZE;
736 		wrote = wrote + PAGE_SIZE;
737 	}
738 }
739 
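/*
 * pause_vm
 *
 * Pauses the running VM: marks it VM_STATE_PAUSED, wakes any halted
 * vcpus via vcpu_run_cond, rendezvouses with the vcpu threads on
 * vm_pause_barrier (vcp_ncpus + 1 parties, including this thread) and
 * then stops the emulated timer/clock and virtio devices.  unpause_vm()
 * below reverses this by broadcasting vcpu_unpause_cond and restarting
 * the devices.
 */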
740 void
741 pause_vm(struct vm_create_params *vcp)
742 {
743 	unsigned int n;
744 	int ret;
745 	if (current_vm->vm_state & VM_STATE_PAUSED)
746 		return;
747 
748 	current_vm->vm_state |= VM_STATE_PAUSED;
749 
750 	ret = pthread_barrier_init(&vm_pause_barrier, NULL, vcp->vcp_ncpus + 1);
751 	if (ret) {
752 		log_warnx("%s: cannot initialize pause barrier (%d)",
753 		    __progname, ret);
754 		return;
755 	}
756 
757 	for (n = 0; n < vcp->vcp_ncpus; n++) {
758 		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
759 		if (ret) {
760 			log_warnx("%s: can't broadcast vcpu run cond (%d)",
761 			    __func__, (int)ret);
762 			return;
763 		}
764 	}
765 	ret = pthread_barrier_wait(&vm_pause_barrier);
766 	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
767 		log_warnx("%s: could not wait on pause barrier (%d)",
768 		    __func__, (int)ret);
769 		return;
770 	}
771 
772 	ret = pthread_barrier_destroy(&vm_pause_barrier);
773 	if (ret) {
774 		log_warnx("%s: could not destroy pause barrier (%d)",
775 		    __progname, ret);
776 		return;
777 	}
778 
779 	i8253_stop();
780 	mc146818_stop();
781 	ns8250_stop();
782 	virtio_stop(vcp);
783 }
784 
785 void
786 unpause_vm(struct vm_create_params *vcp)
787 {
788 	unsigned int n;
789 	int ret;
790 	if (!(current_vm->vm_state & VM_STATE_PAUSED))
791 		return;
792 
793 	current_vm->vm_state &= ~VM_STATE_PAUSED;
794 	for (n = 0; n < vcp->vcp_ncpus; n++) {
795 		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
796 		if (ret) {
797 			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
798 			    __func__, (int)ret);
799 			return;
800 		}
801 	}
802 
803 	i8253_start();
804 	mc146818_start();
805 	ns8250_start();
806 	virtio_start(vcp);
807 }
808 
809 /*
810  * vcpu_reset
811  *
812  * Requests vmm(4) to reset the VCPU identified by 'vcpu_id' in the
813  * indicated VM to the register state provided
814  *
815  * Parameters
816  *  vmid: VM ID to reset
817  *  vcpu_id: VCPU ID to reset
818  *  vrs: the register state to initialize
819  *
820  * Return values:
821  *  0: success
822  *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
823  *      valid)
824  */
825 int
826 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
827 {
828 	struct vm_resetcpu_params vrp;
829 
830 	memset(&vrp, 0, sizeof(vrp));
831 	vrp.vrp_vm_id = vmid;
832 	vrp.vrp_vcpu_id = vcpu_id;
833 	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
834 
835 	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
836 
837 	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
838 		return (errno);
839 
840 	return (0);
841 }
842 
843 /*
844  * create_memory_map
845  *
846  * Sets up the guest physical memory ranges that the VM can access.
847  *
848  * Parameters:
849  *  vcp: VM create parameters describing the VM whose memory map
850  *       is being created
851  *
852  * Return values:
853  *  nothing
854  */
855 void
856 create_memory_map(struct vm_create_params *vcp)
857 {
858 	size_t len, mem_bytes, mem_mb;
859 
860 	mem_mb = vcp->vcp_memranges[0].vmr_size;
861 	vcp->vcp_nmemranges = 0;
862 	if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
863 		return;
864 
865 	mem_bytes = mem_mb * 1024 * 1024;
866 
867 	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
868 	len = LOWMEM_KB * 1024;
869 	vcp->vcp_memranges[0].vmr_gpa = 0x0;
870 	vcp->vcp_memranges[0].vmr_size = len;
871 	mem_bytes -= len;
872 
873 	/*
874 	 * Second memory region: LOWMEM_KB - 1MB.
875 	 *
876 	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
877 	 * We have to add this region, because some systems
878 	 * unconditionally write to 0xb8000 (VGA RAM), and
879 	 * we need to make sure that vmm(4) permits accesses
880 	 * to it. So allocate guest memory for it.
881 	 */
882 	len = 0x100000 - LOWMEM_KB * 1024;
883 	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
884 	vcp->vcp_memranges[1].vmr_size = len;
885 	mem_bytes -= len;
886 
887 	/* Make sure that we do not place physical memory into MMIO ranges. */
888 	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
889 		len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
890 	else
891 		len = mem_bytes;
892 
893 	/* Third memory region: 1MB - (1MB + len) */
894 	vcp->vcp_memranges[2].vmr_gpa = 0x100000;
895 	vcp->vcp_memranges[2].vmr_size = len;
896 	mem_bytes -= len;
897 
898 	if (mem_bytes > 0) {
899 		/* Fourth memory region for the remaining memory (if any) */
900 		vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
901 		vcp->vcp_memranges[3].vmr_size = mem_bytes;
902 		vcp->vcp_nmemranges = 4;
903 	} else
904 		vcp->vcp_nmemranges = 3;
905 }
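
/*
 * The resulting guest-physical layout, in terms of the constants used
 * above:
 *
 *	range 0: [0, LOWMEM_KB * 1024)           DOS low memory
 *	range 1: [LOWMEM_KB * 1024, 1MB)         ROM / VGA hole (allocated)
 *	range 2: [1MB, 1MB + len)                capped at
 *	                                         VMM_PCI_MMIO_BAR_BASE
 *	range 3: [VMM_PCI_MMIO_BAR_END + 1, ...) remainder, if any
 */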
906 
907 /*
908  * alloc_guest_mem
909  *
910  * Allocates memory for the guest.
911  * Instead of doing a single allocation with one mmap(), we allocate memory
912  * separately for every range for the following reasons:
913  * - ASLR for the individual ranges
914  * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
915  *   map the single mmap'd userspace memory to the individual guest physical
916  *   memory ranges, the underlying amap of the single mmap'd range would have
917  *   to allocate per-page reference counters. The reason is that the
918  *   individual guest physical ranges would reference the single mmap'd region
919  *   only partially. However, if every guest physical range has its own
920  *   corresponding mmap'd userspace allocation, there are no partial
921  *   references: every guest physical range fully references an mmap'd
922  *   range => no per-page reference counters have to be allocated.
923  *
924  * Return values:
925  *  0: success
926  *  !0: failure - errno indicating the source of the failure
927  */
928 int
929 alloc_guest_mem(struct vm_create_params *vcp)
930 {
931 	void *p;
932 	int ret;
933 	size_t i, j;
934 	struct vm_mem_range *vmr;
935 
936 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
937 		vmr = &vcp->vcp_memranges[i];
938 		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
939 		    MAP_PRIVATE | MAP_ANON, -1, 0);
940 		if (p == MAP_FAILED) {
941 			ret = errno;
942 			for (j = 0; j < i; j++) {
943 				vmr = &vcp->vcp_memranges[j];
944 				munmap((void *)vmr->vmr_va, vmr->vmr_size);
945 			}
946 
947 			return (ret);
948 		}
949 
950 		vmr->vmr_va = (vaddr_t)p;
951 	}
952 
953 	return (0);
954 }
955 
956 /*
957  * vmm_create_vm
958  *
959  * Requests vmm(4) to create a new VM using the supplied creation
960  * parameters. This operation results in the creation of the in-kernel
961  * structures for the VM, but does not start the VM's vcpu(s).
962  *
963  * Parameters:
964  *  vcp: vm_create_params struct containing the VM's desired creation
965  *      configuration
966  *
967  * Return values:
968  *  0: success
969  *  !0 : ioctl to vmm(4) failed
970  */
971 int
972 vmm_create_vm(struct vm_create_params *vcp)
973 {
974 	/* Sanity check arguments */
975 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
976 		return (EINVAL);
977 
978 	if (vcp->vcp_nmemranges == 0 ||
979 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
980 		return (EINVAL);
981 
982 	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
983 		return (EINVAL);
984 
985 	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
986 		return (EINVAL);
987 
988 	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
989 		return (errno);
990 
991 	return (0);
992 }
993 
994 /*
995  * init_emulated_hw
996  *
997  * Initializes the userspace hardware emulation
998  */
999 void
1000 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
1001     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1002 {
1003 	struct vm_create_params *vcp = &vmc->vmc_params;
1004 	int i;
1005 	uint64_t memlo, memhi;
1006 
1007 	/* Calculate memory size for NVRAM registers */
1008 	memlo = memhi = 0;
1009 	if (vcp->vcp_nmemranges > 2)
1010 		memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000;
1011 
1012 	if (vcp->vcp_nmemranges > 3)
1013 		memhi = vcp->vcp_memranges[3].vmr_size;
1014 
1015 	/* Reset the IO port map */
1016 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1017 
1018 	/* Init i8253 PIT */
1019 	i8253_init(vcp->vcp_id);
1020 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1021 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1022 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1023 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1024 	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
1025 
1026 	/* Init mc146818 RTC */
1027 	mc146818_init(vcp->vcp_id, memlo, memhi);
1028 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1029 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1030 
1031 	/* Init master and slave PICs */
1032 	i8259_init();
1033 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1034 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1035 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1036 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1037 	ioports_map[ELCR0] = vcpu_exit_elcr;
1038 	ioports_map[ELCR1] = vcpu_exit_elcr;
1039 
1040 	/* Init ns8250 UART */
1041 	ns8250_init(con_fd, vcp->vcp_id);
1042 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1043 		ioports_map[i] = vcpu_exit_com;
1044 
1045 	/* Init QEMU fw_cfg interface */
1046 	fw_cfg_init(vmc);
1047 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1048 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1049 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1050 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1051 
1052 	/* Initialize PCI */
1053 	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
1054 		ioports_map[i] = vcpu_exit_pci;
1055 
1056 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1057 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1058 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1059 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1060 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1061 	pci_init();
1062 
1063 	/* Initialize virtio devices */
1064 	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
1065 }
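
/*
 * The ioports_map[] table filled in above is the single dispatch point
 * for emulated port I/O: on a VMX_EXIT_IO/SVM_VMEXIT_IOIO exit,
 * vcpu_exit_inout() below looks up the faulting port and calls the
 * registered handler (e.g. an OUT to TIMER_CTRL lands in
 * vcpu_exit_i8253()).  A minimal sketch of wiring up a handler for a
 * hypothetical device at a hypothetical port would be:
 *
 *	uint8_t my_dev_io(struct vm_run_params *vrp);
 *
 *	ioports_map[MY_DEV_PORT] = my_dev_io;
 *
 * where the handler returns the IRQ to assert, or 0xFF for none, as
 * vcpu_exit_inout() expects.
 */
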
1066 /*
1067  * restore_emulated_hw
1068  *
1069  * Restores the userspace hardware emulation from fd
1070  */
1071 void
1072 restore_emulated_hw(struct vm_create_params *vcp, int fd,
1073     int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
1074 {
1075 	/* struct vm_create_params *vcp = &vmc->vmc_params; */
1076 	int i;
1077 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1078 
1079 	/* Init i8253 PIT */
1080 	i8253_restore(fd, vcp->vcp_id);
1081 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1082 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1083 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1084 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1085 
1086 	/* Init master and slave PICs */
1087 	i8259_restore(fd);
1088 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1089 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1090 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1091 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1092 
1093 	/* Init ns8250 UART */
1094 	ns8250_restore(fd, con_fd, vcp->vcp_id);
1095 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1096 		ioports_map[i] = vcpu_exit_com;
1097 
1098 	/* Init mc146818 RTC */
1099 	mc146818_restore(fd, vcp->vcp_id);
1100 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1101 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1102 
1103 	/* Init QEMU fw_cfg interface */
1104 	fw_cfg_restore(fd);
1105 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1106 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1107 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1108 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1109 
1110 	/* Initialize PCI */
1111 	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
1112 		ioports_map[i] = vcpu_exit_pci;
1113 
1114 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1115 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1116 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1117 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1118 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1119 	pci_restore(fd);
1120 	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
1121 }
1122 
1123 /*
1124  * run_vm
1125  *
1126  * Runs the VM whose creation parameters are specified in vmc
1127  *
1128  * Parameters:
1129  *  child_cdrom: previously-opened child ISO disk file descriptor
1130  *  child_disks: previously-opened child VM disk file descriptors
1131  *  child_taps: previously-opened child tap file descriptors
1132  *  vmc: vmop_create_params struct containing the VM's desired creation
1133  *      configuration
1134  *  vrs: VCPU register state to initialize
1135  *
1136  * Return values:
1137  *  0: the VM exited normally
1138  *  !0 : the VM exited abnormally or failed to start
1139  */
1140 int
1141 run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
1142     int *child_taps, struct vmop_create_params *vmc,
1143     struct vcpu_reg_state *vrs)
1144 {
1145 	struct vm_create_params *vcp = &vmc->vmc_params;
1146 	struct vm_rwregs_params vregsp;
1147 	uint8_t evdone = 0;
1148 	size_t i;
1149 	int ret;
1150 	pthread_t *tid, evtid;
1151 	struct vm_run_params **vrp;
1152 	void *exit_status;
1153 
1154 	if (vcp == NULL)
1155 		return (EINVAL);
1156 
1157 	if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
1158 		return (EINVAL);
1159 
1160 	if (child_disks == NULL && vcp->vcp_ndisks != 0)
1161 		return (EINVAL);
1162 
1163 	if (child_taps == NULL && vcp->vcp_nnics != 0)
1164 		return (EINVAL);
1165 
1166 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
1167 		return (EINVAL);
1168 
1169 	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
1170 		return (EINVAL);
1171 
1172 	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
1173 		return (EINVAL);
1174 
1175 	if (vcp->vcp_nmemranges == 0 ||
1176 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1177 		return (EINVAL);
1178 
1179 	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
1180 	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
1181 	if (tid == NULL || vrp == NULL) {
1182 		log_warn("%s: memory allocation error - exiting.",
1183 		    __progname);
1184 		return (ENOMEM);
1185 	}
1186 
1187 	log_debug("%s: initializing hardware for vm %s", __func__,
1188 	    vcp->vcp_name);
1189 
1190 	if (!(current_vm->vm_state & VM_STATE_RECEIVED))
1191 		init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);
1192 
1193 	ret = pthread_mutex_init(&threadmutex, NULL);
1194 	if (ret) {
1195 		log_warn("%s: could not initialize thread state mutex",
1196 		    __func__);
1197 		return (ret);
1198 	}
1199 	ret = pthread_cond_init(&threadcond, NULL);
1200 	if (ret) {
1201 		log_warn("%s: could not initialize thread state "
1202 		    "condition variable", __func__);
1203 		return (ret);
1204 	}
1205 
1206 	mutex_lock(&threadmutex);
1207 
1208 	log_debug("%s: starting vcpu threads for vm %s", __func__,
1209 	    vcp->vcp_name);
1210 
1211 	/*
1212 	 * Create and launch one thread for each VCPU. These threads may
1213 	 * migrate between PCPUs over time; any reloading of CPU state this
1214 	 * requires is detected and performed by vmm(4) in the
1215 	 * kernel.
1216 	 */
1217 	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
1218 		vrp[i] = malloc(sizeof(struct vm_run_params));
1219 		if (vrp[i] == NULL) {
1220 			log_warn("%s: memory allocation error - "
1221 			    "exiting.", __progname);
1222 			/* caller will exit, so skip freeing */
1223 			return (ENOMEM);
1224 		}
1225 		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
1226 		if (vrp[i]->vrp_exit == NULL) {
1227 			log_warn("%s: memory allocation error - "
1228 			    "exiting.", __progname);
1229 			/* caller will exit, so skip freeing */
1230 			return (ENOMEM);
1231 		}
1232 		vrp[i]->vrp_vm_id = vcp->vcp_id;
1233 		vrp[i]->vrp_vcpu_id = i;
1234 
1235 		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
1236 			log_warnx("%s: cannot reset VCPU %zu - exiting.",
1237 			    __progname, i);
1238 			return (EIO);
1239 		}
1240 
1241 		/* once more because vcpu_reset() changes regs */
1242 		if (current_vm->vm_state & VM_STATE_RECEIVED) {
1243 			vregsp.vrwp_vm_id = vcp->vcp_id;
1244 			vregsp.vrwp_vcpu_id = i;
1245 			vregsp.vrwp_regs = *vrs;
1246 			vregsp.vrwp_mask = VM_RWREGS_ALL;
1247 			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
1248 			    &vregsp)) == -1) {
1249 				log_warn("%s: writeregs failed", __func__);
1250 				return (ret);
1251 			}
1252 		}
1253 
1254 		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
1255 		if (ret) {
1256 			log_warnx("%s: cannot initialize cond var (%d)",
1257 			    __progname, ret);
1258 			return (ret);
1259 		}
1260 
1261 		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
1262 		if (ret) {
1263 			log_warnx("%s: cannot initialize mtx (%d)",
1264 			    __progname, ret);
1265 			return (ret);
1266 		}
1267 
1268 		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
1269 		if (ret) {
1270 			log_warnx("%s: cannot initialize unpause var (%d)",
1271 			    __progname, ret);
1272 			return (ret);
1273 		}
1274 
1275 		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
1276 		if (ret) {
1277 			log_warnx("%s: cannot initialize unpause mtx (%d)",
1278 			    __progname, ret);
1279 			return (ret);
1280 		}
1281 
1282 		vcpu_hlt[i] = 0;
1283 
1284 		/* Start each VCPU run thread at vcpu_run_loop */
1285 		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
1286 		if (ret) {
1287 			/* caller will _exit after this return */
1288 			ret = errno;
1289 			log_warn("%s: could not create vcpu thread %zu",
1290 			    __func__, i);
1291 			return (ret);
1292 		}
1293 	}
1294 
1295 	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
1296 	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
1297 	if (ret) {
1298 		errno = ret;
1299 		log_warn("%s: could not create event thread", __func__);
1300 		return (ret);
1301 	}
1302 
1303 	for (;;) {
1304 		ret = pthread_cond_wait(&threadcond, &threadmutex);
1305 		if (ret) {
1306 			log_warn("%s: waiting on thread state condition "
1307 			    "variable failed", __func__);
1308 			return (ret);
1309 		}
1310 
1311 		/*
1312 		 * Did a VCPU thread exit with an error? => return the first one
1313 		 */
1314 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1315 			if (vcpu_done[i] == 0)
1316 				continue;
1317 
1318 			if (pthread_join(tid[i], &exit_status)) {
1319 				log_warn("%s: failed to join thread %zd - "
1320 				    "exiting", __progname, i);
1321 				return (EIO);
1322 			}
1323 
1324 			ret = (intptr_t)exit_status;
1325 		}
1326 
1327 		/* Did the event thread exit? => return with an error */
1328 		if (evdone) {
1329 			if (pthread_join(evtid, &exit_status)) {
1330 				log_warn("%s: failed to join event thread - "
1331 				    "exiting", __progname);
1332 				return (EIO);
1333 			}
1334 
1335 			log_warnx("%s: vm %d event thread exited "
1336 			    "unexpectedly", __progname, vcp->vcp_id);
1337 			return (EIO);
1338 		}
1339 
1340 		/* Did all VCPU threads exit successfully? => return */
1341 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1342 			if (vcpu_done[i] == 0)
1343 				break;
1344 		}
1345 		if (i == vcp->vcp_ncpus)
1346 			return (ret);
1347 
1348 		/* Some more threads to wait for, start over */
1349 	}
1350 
1351 	return (ret);
1352 }
1353 
1354 void *
1355 event_thread(void *arg)
1356 {
1357 	uint8_t *donep = arg;
1358 	intptr_t ret;
1359 
1360 	ret = event_dispatch();
1361 
1362 	mutex_lock(&threadmutex);
1363 	*donep = 1;
1364 	pthread_cond_signal(&threadcond);
1365 	mutex_unlock(&threadmutex);
1366 
1367 	return (void *)ret;
1368 }
1369 
1370 /*
1371  * vcpu_run_loop
1372  *
1373  * Runs a single VCPU until vmm(4) requires help handling an exit,
1374  * or the VM terminates.
1375  *
1376  * Parameters:
1377  *  arg: vcpu_run_params for the VCPU being run by this thread
1378  *
1379  * Return values:
1380  *  NULL: the VCPU shutdown properly
1381  *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
1382  */
1383 void *
1384 vcpu_run_loop(void *arg)
1385 {
1386 	struct vm_run_params *vrp = (struct vm_run_params *)arg;
1387 	intptr_t ret = 0;
1388 	int irq;
1389 	uint32_t n;
1390 
1391 	vrp->vrp_continue = 0;
1392 	n = vrp->vrp_vcpu_id;
1393 
1394 	for (;;) {
1395 		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
1396 
1397 		if (ret) {
1398 			log_warnx("%s: can't lock vcpu run mtx (%d)",
1399 			    __func__, (int)ret);
1400 			return ((void *)ret);
1401 		}
1402 
1403 		/* If we are halted and need to pause, pause */
1404 		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
1405 			ret = pthread_barrier_wait(&vm_pause_barrier);
1406 			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
1407 				log_warnx("%s: could not wait on pause barrier (%d)",
1408 				    __func__, (int)ret);
1409 				return ((void *)ret);
1410 			}
1411 
1412 			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
1413 			if (ret) {
1414 				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
1415 				    __func__, (int)ret);
1416 				return ((void *)ret);
1417 			}
1418 
1419 			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
1420 			    &vcpu_unpause_mtx[n]);
1421 			if (ret) {
1422 				log_warnx(
1423 				    "%s: can't wait on unpause cond (%d)",
1424 				    __func__, (int)ret);
1425 				break;
1426 			}
1427 			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
1428 			if (ret) {
1429 				log_warnx("%s: can't unlock unpause mtx (%d)",
1430 				    __func__, (int)ret);
1431 				break;
1432 			}
1433 		}
1434 
1435 		/* If we are halted and not paused, wait */
1436 		if (vcpu_hlt[n]) {
1437 			ret = pthread_cond_wait(&vcpu_run_cond[n],
1438 			    &vcpu_run_mtx[n]);
1439 
1440 			if (ret) {
1441 				log_warnx(
1442 				    "%s: can't wait on cond (%d)",
1443 				    __func__, (int)ret);
1444 				(void)pthread_mutex_unlock(
1445 				    &vcpu_run_mtx[n]);
1446 				break;
1447 			}
1448 		}
1449 
1450 		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
1451 
1452 		if (ret) {
1453 			log_warnx("%s: can't unlock mutex on cond (%d)",
1454 			    __func__, (int)ret);
1455 			break;
1456 		}
1457 
1458 		if (vrp->vrp_irqready && i8259_is_pending()) {
1459 			irq = i8259_ack();
1460 			vrp->vrp_irq = irq;
1461 		} else
1462 			vrp->vrp_irq = 0xFFFF;
1463 
1464 		/* Still more pending? */
1465 		if (i8259_is_pending()) {
1466 			/* XXX can probably avoid ioctls here by providing intr in vrp */
1467 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1468 			    vrp->vrp_vcpu_id, 1)) {
1469 				fatal("can't set INTR");
1470 			}
1471 		} else {
1472 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1473 			    vrp->vrp_vcpu_id, 0)) {
1474 				fatal("can't clear INTR");
1475 			}
1476 		}
1477 
1478 		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
1479 			/* If run ioctl failed, exit */
1480 			ret = errno;
1481 			log_warn("%s: vm %d / vcpu %d run ioctl failed",
1482 			    __func__, vrp->vrp_vm_id, n);
1483 			break;
1484 		}
1485 
1486 		/* If the VM is terminating, exit normally */
1487 		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
1488 			ret = (intptr_t)NULL;
1489 			break;
1490 		}
1491 
1492 		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
1493 			/*
1494 			 * vmm(4) needs help handling an exit, handle in
1495 			 * vcpu_exit.
1496 			 */
1497 			ret = vcpu_exit(vrp);
1498 			if (ret)
1499 				break;
1500 		}
1501 	}
1502 
1503 	mutex_lock(&threadmutex);
1504 	vcpu_done[n] = 1;
1505 	pthread_cond_signal(&threadcond);
1506 	mutex_unlock(&threadmutex);
1507 
1508 	return ((void *)ret);
1509 }
1510 
1511 int
1512 vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
1513 {
1514 	struct vm_intr_params vip;
1515 
1516 	memset(&vip, 0, sizeof(vip));
1517 
1518 	vip.vip_vm_id = vm_id;
1519 	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
1520 	vip.vip_intr = intr;
1521 
1522 	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
1523 		return (errno);
1524 
1525 	return (0);
1526 }
1527 
1528 /*
1529  * vcpu_exit_pci
1530  *
1531  * Handle all I/O to the emulated PCI subsystem.
1532  *
1533  * Parameters:
1534  *  vrp: vcpu run parameters containing guest state for this exit
1535  *
1536  * Return value:
1537  *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
1538  *      be injected.
1539  */
1540 uint8_t
1541 vcpu_exit_pci(struct vm_run_params *vrp)
1542 {
1543 	struct vm_exit *vei = vrp->vrp_exit;
1544 	uint8_t intr;
1545 
1546 	intr = 0xFF;
1547 
1548 	switch (vei->vei.vei_port) {
1549 	case PCI_MODE1_ADDRESS_REG:
1550 		pci_handle_address_reg(vrp);
1551 		break;
1552 	case PCI_MODE1_DATA_REG:
1553 	case PCI_MODE1_DATA_REG + 1:
1554 	case PCI_MODE1_DATA_REG + 2:
1555 	case PCI_MODE1_DATA_REG + 3:
1556 		pci_handle_data_reg(vrp);
1557 		break;
1558 	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
1559 		intr = pci_handle_io(vrp);
1560 		break;
1561 	default:
1562 		log_warnx("%s: unknown PCI register 0x%llx",
1563 		    __progname, (uint64_t)vei->vei.vei_port);
1564 		break;
1565 	}
1566 
1567 	return (intr);
1568 }
1569 
1570 /*
1571  * vcpu_exit_inout
1572  *
1573  * Handle all I/O exits that need to be emulated in vmd. This includes the
1574  * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
1575  *
1576  * Parameters:
1577  *  vrp: vcpu run parameters containing guest state for this exit
1578  */
1579 void
1580 vcpu_exit_inout(struct vm_run_params *vrp)
1581 {
1582 	struct vm_exit *vei = vrp->vrp_exit;
1583 	uint8_t intr = 0xFF;
1584 
1585 	if (ioports_map[vei->vei.vei_port] != NULL)
1586 		intr = ioports_map[vei->vei.vei_port](vrp);
1587 	else if (vei->vei.vei_dir == VEI_DIR_IN)
1588 		set_return_data(vei, 0xFFFFFFFF);
1589 
1590 	if (intr != 0xFF)
1591 		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
1592 }
1593 
1594 /*
1595  * vcpu_exit_eptviolation
1596  *
1597  * Handle an EPT violation.
1598  *
1599  *
1600  * Parameters:
1601  *  vrp: vcpu run parameters containing guest state for this exit
1602  *
1603  * Return values:
1604  *  0: no action required
1605  *  EAGAIN: a protection fault occurred, kill the VM.
1606  */
1607 int
1608 vcpu_exit_eptviolation(struct vm_run_params *vrp)
1609 {
1610 	struct vm_exit *ve = vrp->vrp_exit;
1611 	/*
1612 	 * vmm(4) may be exiting to vmd to handle a pending interrupt,
1613 	 * but the last exit type may have been VMX_EXIT_EPT_VIOLATION;
1614 	 * check the fault_type to ensure we really are processing
1615 	 * a VMX_EXIT_EPT_VIOLATION.
1616 	 */
1617 	if (ve->vee.vee_fault_type == VEE_FAULT_PROTECT) {
1618 		log_debug("%s: EPT Violation: rip=0x%llx",
1619 		    __progname, vrp->vrp_exit->vrs.vrs_gprs[VCPU_REGS_RIP]);
1620 		return (EAGAIN);
1621 	}
1622 
1623 	return (0);
1624 }
1625 
1626 /*
1627  * vcpu_exit
1628  *
1629  * Handle a vcpu exit. This function is called when it is determined that
1630  * vmm(4) requires the assistance of vmd to support a particular guest
1631  * exit type (eg, accessing an I/O port or device). Guest state is contained
1632  * in 'vrp', and will be resent to vmm(4) on exit completion.
1633  *
1634  * Upon conclusion of handling the exit, the function determines if any
1635  * interrupts should be injected into the guest, and asserts the proper
1636  * IRQ line whose interrupt should be vectored.
1637  *
1638  * Parameters:
1639  *  vrp: vcpu run parameters containing guest state for this exit
1640  *
1641  * Return values:
1642  *  0: the exit was handled successfully
1643  *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
1644  */
1645 int
1646 vcpu_exit(struct vm_run_params *vrp)
1647 {
1648 	int ret;
1649 
1650 	switch (vrp->vrp_exit_reason) {
1651 	case VMX_EXIT_INT_WINDOW:
1652 	case SVM_VMEXIT_VINTR:
1653 	case VMX_EXIT_CPUID:
1654 	case VMX_EXIT_EXTINT:
1655 	case SVM_VMEXIT_INTR:
1656 	case SVM_VMEXIT_NPF:
1657 	case SVM_VMEXIT_MSR:
1658 	case SVM_VMEXIT_CPUID:
1659 		/*
1660 		 * We may be exiting to vmd to handle a pending interrupt but
1661 		 * at the same time the last exit type may have been one of
1662 		 * these. In this case, there's nothing extra to be done
1663 		 * here (and falling through to the default case below results
1664 		 * in more vmd log spam).
1665 		 */
1666 		break;
1667 	case VMX_EXIT_EPT_VIOLATION:
1668 		ret = vcpu_exit_eptviolation(vrp);
1669 		if (ret)
1670 			return (ret);
1671 
1672 		break;
1673 	case VMX_EXIT_IO:
1674 	case SVM_VMEXIT_IOIO:
1675 		vcpu_exit_inout(vrp);
1676 		break;
1677 	case VMX_EXIT_HLT:
1678 	case SVM_VMEXIT_HLT:
1679 		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1680 		if (ret) {
1681 			log_warnx("%s: can't lock vcpu mutex (%d)",
1682 			    __func__, ret);
1683 			return (ret);
1684 		}
1685 		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
1686 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1687 		if (ret) {
1688 			log_warnx("%s: can't unlock vcpu mutex (%d)",
1689 			    __func__, ret);
1690 			return (ret);
1691 		}
1692 		break;
1693 	case VMX_EXIT_TRIPLE_FAULT:
1694 	case SVM_VMEXIT_SHUTDOWN:
1695 		/* reset VM */
1696 		return (EAGAIN);
1697 	default:
1698 		log_debug("%s: unknown exit reason 0x%x",
1699 		    __progname, vrp->vrp_exit_reason);
1700 	}
1701 
1702 	/* Process any pending traffic */
1703 	vionet_process_rx(vrp->vrp_vm_id);
1704 
1705 	vrp->vrp_continue = 1;
1706 
1707 	return (0);
1708 }
1709 
1710 /*
1711  * find_gpa_range
1712  *
1713  * Search for a contiguous guest physical mem range.
1714  *
1715  * Parameters:
1716  *  vcp: VM create parameters that contain the memory map to search in
1717  *  gpa: the starting guest physical address
1718  *  len: the length of the memory range
1719  *
1720  * Return values:
1721  *  NULL: on failure if there is no memory range as described by the parameters
1722  *  Pointer to vm_mem_range that contains the start of the range otherwise.
1723  */
1724 static struct vm_mem_range *
1725 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
1726 {
1727 	size_t i, n;
1728 	struct vm_mem_range *vmr;
1729 
1730 	/* Find the first vm_mem_range that contains gpa */
1731 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1732 		vmr = &vcp->vcp_memranges[i];
1733 		if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
1734 			break;
1735 	}
1736 
1737 	/* No range found. */
1738 	if (i == vcp->vcp_nmemranges)
1739 		return (NULL);
1740 
1741 	/*
1742 	 * vmr may cover the range [gpa, gpa + len) only partly. Make
1743 	 * sure that the following vm_mem_ranges are contiguous and
1744 	 * cover the rest.
1745 	 */
1746 	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
1747 	if (len < n)
1748 		len = 0;
1749 	else
1750 		len -= n;
1751 	gpa = vmr->vmr_gpa + vmr->vmr_size;
1752 	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
1753 		vmr = &vcp->vcp_memranges[i];
1754 		if (gpa != vmr->vmr_gpa)
1755 			return (NULL);
1756 		if (len <= vmr->vmr_size)
1757 			len = 0;
1758 		else
1759 			len -= vmr->vmr_size;
1760 
1761 		gpa = vmr->vmr_gpa + vmr->vmr_size;
1762 	}
1763 
1764 	if (len != 0)
1765 		return (NULL);
1766 
1767 	return (vmr);
1768 }
1769 
1770 void *
1771 vaddr_mem(paddr_t gpa, size_t len)
1772 {
1773 	struct vm_create_params *vcp = &current_vm->vm_params.vmc_params;
1774 	size_t i;
1775 	struct vm_mem_range *vmr;
1776 	paddr_t gpend = gpa + len;
1777 
1778 	/* Find the first vm_mem_range that contains gpa */
1779 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1780 		vmr = &vcp->vcp_memranges[i];
1781 		if (gpa < vmr->vmr_gpa)
1782 			continue;
1783 
1784 		if (gpend >= vmr->vmr_gpa + vmr->vmr_size)
1785 			continue;
1786 
1787 		return ((char *)vmr->vmr_va + (gpa - vmr->vmr_gpa));
1788 	}
1789 
1790 	return (NULL);
1791 }
1792 
1793 /*
1794  * write_mem
1795  *
1796  * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
1797  *
1798  * Parameters:
1799  *  dst: the destination paddr_t in the guest VM
1800  *  buf: data to copy (or NULL to zero the data)
1801  *  len: number of bytes to copy
1802  *
1803  * Return values:
1804  *  0: success
1805  *  EINVAL: if the guest physical memory range [dst, dst + len) does not
1806  *      exist in the guest.
1807  */
1808 int
1809 write_mem(paddr_t dst, const void *buf, size_t len)
1810 {
1811 	const char *from = buf;
1812 	char *to;
1813 	size_t n, off;
1814 	struct vm_mem_range *vmr;
1815 
1816 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
1817 	if (vmr == NULL) {
1818 		errno = EINVAL;
1819 		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
1820 		    "len = 0x%zx", __func__, dst, len);
1821 		return (EINVAL);
1822 	}
1823 
1824 	off = dst - vmr->vmr_gpa;
1825 	while (len != 0) {
1826 		n = vmr->vmr_size - off;
1827 		if (len < n)
1828 			n = len;
1829 
1830 		to = (char *)vmr->vmr_va + off;
1831 		if (buf == NULL)
1832 			memset(to, 0, n);
1833 		else {
1834 			memcpy(to, from, n);
1835 			from += n;
1836 		}
1837 		len -= n;
1838 		off = 0;
1839 		vmr++;
1840 	}
1841 
1842 	return (0);
1843 }
1844 
1845 /*
1846  * read_mem
1847  *
1848  * Reads memory at guest paddr 'src' into 'buf'.
1849  *
1850  * Parameters:
1851  *  src: the source paddr_t in the guest VM to read from.
1852  *  buf: destination (local) buffer
1853  *  len: number of bytes to read
1854  *
1855  * Return values:
1856  *  0: success
1857  *  EINVAL: if the guest physical memory range [src, src + len) does not
1858  *      exist in the guest.
1859  */
1860 int
1861 read_mem(paddr_t src, void *buf, size_t len)
1862 {
1863 	char *from, *to = buf;
1864 	size_t n, off;
1865 	struct vm_mem_range *vmr;
1866 
1867 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
1868 	if (vmr == NULL) {
1869 		errno = EINVAL;
1870 		log_warn("%s: failed - invalid memory range src = 0x%lx, "
1871 		    "len = 0x%zx", __func__, src, len);
1872 		return (EINVAL);
1873 	}
1874 
1875 	off = src - vmr->vmr_gpa;
1876 	while (len != 0) {
1877 		n = vmr->vmr_size - off;
1878 		if (len < n)
1879 			n = len;
1880 
1881 		from = (char *)vmr->vmr_va + off;
1882 		memcpy(to, from, n);
1883 
1884 		to += n;
1885 		len -= n;
1886 		off = 0;
1887 		vmr++;
1888 	}
1889 
1890 	return (0);
1891 }
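
/*
 * Usage sketch for read_mem()/write_mem() (hypothetical caller): device
 * emulation code copies a guest-resident structure out of and back into
 * guest memory, checking the return value since the guest may hand us a
 * bogus address:
 *
 *	struct my_desc d;                       (hypothetical type)
 *
 *	if (read_mem(desc_gpa, &d, sizeof(d)))
 *		return;
 *	d.flags |= MY_DESC_DONE;
 *	if (write_mem(desc_gpa, &d, sizeof(d)))
 *		return;
 */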
1892 
1893 int
1894 iovec_mem(paddr_t src, size_t len, struct iovec *iov, int iovcnt)
1895 {
1896 	size_t n, off;
1897 	struct vm_mem_range *vmr;
1898 	int niov = 0;
1899 
1900 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
1901 	if (vmr == NULL) {
1902 		errno = EINVAL;
1903 		return (-1);
1904 	}
1905 
1906 	off = src - vmr->vmr_gpa;
1907 	while (len > 0) {
1908 		if (niov == iovcnt) {
1909 			errno = ENOMEM;
1910 			return (-1);
1911 		}
1912 
1913 		n = vmr->vmr_size - off;
1914 		if (len < n)
1915 			n = len;
1916 
1917 		iov[niov].iov_base = (char *)vmr->vmr_va + off;
1918 		iov[niov].iov_len = n;
1919 
1920 		niov++;
1921 
1922 		len -= n;
1923 		off = 0;
1924 		vmr++;
1925 	}
1926 
1927 	return (niov);
1928 }
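/*
 * Example usage (sketch): a device model can map a guest buffer into iovecs
 * and hand them straight to writev(2), avoiding an intermediate copy.
 * 'fd', 'gpa' and 'sz' are assumed to come from an already validated
 * descriptor:
 *
 *	struct iovec iov[4];
 *	int niov = iovec_mem(gpa, sz, iov, 4);
 *	if (niov == -1 || writev(fd, iov, niov) == -1)
 *		log_warn("writing guest buffer failed");
 */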
1929 
1930 /*
1931  * vcpu_assert_pic_irq
1932  *
1933  * Injects the specified IRQ on the supplied vcpu/vm
1934  *
1935  * Parameters:
1936  *  vm_id: VM ID to inject to
1937  *  vcpu_id: VCPU ID to inject to
1938  *  irq: IRQ to inject
1939  */
1940 void
1941 vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1942 {
1943 	int ret;
1944 
1945 	i8259_assert_irq(irq);
1946 
1947 	if (i8259_is_pending()) {
1948 		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
1949 			fatalx("%s: can't assert INTR", __func__);
1950 
1951 		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
1952 		if (ret)
1953 			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);
1954 
1955 		vcpu_hlt[vcpu_id] = 0;
1956 		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
1957 		if (ret)
1958 			fatalx("%s: can't signal (%d)", __func__, ret);
1959 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
1960 		if (ret)
1961 			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
1962 	}
1963 }
1964 
1965 /*
1966  * vcpu_deassert_pic_irq
1967  *
1968  * Clears the specified IRQ on the supplied vcpu/vm
1969  *
1970  * Parameters:
1971  *  vm_id: VM ID to clear in
1972  *  vcpu_id: VCPU ID to clear in
1973  *  irq: IRQ to clear
1974  */
1975 void
1976 vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1977 {
1978 	i8259_deassert_irq(irq);
1979 
1980 	if (!i8259_is_pending()) {
1981 		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
1982 			fatalx("%s: can't deassert INTR for vm_id %d, "
1983 			    "vcpu_id %d", __func__, vm_id, vcpu_id);
1984 	}
1985 }
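/*
 * Example usage (sketch): a device model that has queued data for the guest
 * raises its line and drops it again once the condition is gone; 'vm_id',
 * 'vcpu_id' and 'irq' are assumed to have been recorded when the device was
 * initialized:
 *
 *	vcpu_assert_pic_irq(vm_id, vcpu_id, irq);
 *	// ... guest services the device ...
 *	vcpu_deassert_pic_irq(vm_id, vcpu_id, irq);
 */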
1986 
1987 /*
1988  * fd_hasdata
1989  *
1990  * Determines if data can be read from a file descriptor.
1991  *
1992  * Parameters:
1993  *  fd: the fd to check
1994  *
1995  * Return values:
1996  *  1 if data can be read from an fd, or 0 otherwise.
1997  */
1998 int
1999 fd_hasdata(int fd)
2000 {
2001 	struct pollfd pfd[1];
2002 	int nready, hasdata = 0;
2003 
2004 	pfd[0].fd = fd;
2005 	pfd[0].events = POLLIN;
2006 	nready = poll(pfd, 1, 0);
2007 	if (nready == -1)
2008 		log_warn("checking file descriptor for data failed");
2009 	else if (nready == 1 && pfd[0].revents & POLLIN)
2010 		hasdata = 1;
2011 	return (hasdata);
2012 }
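/*
 * Example usage (sketch): a device model can drain pending input without
 * blocking; 'fd' is the backing file descriptor and 'consume_byte' is a
 * hypothetical per-device handler:
 *
 *	char c;
 *	while (fd_hasdata(fd) && read(fd, &c, 1) == 1)
 *		consume_byte(c);
 */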
2013 
2014 /*
2015  * mutex_lock
2016  *
2017  * Wrapper function for pthread_mutex_lock that does error checking and
2018  * exits on failure
2019  */
2020 void
2021 mutex_lock(pthread_mutex_t *m)
2022 {
2023 	int ret;
2024 
2025 	ret = pthread_mutex_lock(m);
2026 	if (ret) {
2027 		errno = ret;
2028 		fatal("could not acquire mutex");
2029 	}
2030 }
2031 
2032 /*
2033  * mutex_unlock
2034  *
2035  * Wrapper function for pthread_mutex_unlock that does error checking and
2036  * exits on failure
2037  */
2038 void
2039 mutex_unlock(pthread_mutex_t *m)
2040 {
2041 	int ret;
2042 
2043 	ret = pthread_mutex_unlock(m);
2044 	if (ret) {
2045 		errno = ret;
2046 		fatal("could not release mutex");
2047 	}
2048 }
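/*
 * Example usage (sketch): the wrappers keep locking sites compact, e.g.
 * waking a halted vcpu as vcpu_assert_pic_irq() above does by hand, with
 * 'vcpu_id' assumed valid:
 *
 *	mutex_lock(&vcpu_run_mtx[vcpu_id]);
 *	vcpu_hlt[vcpu_id] = 0;
 *	pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
 *	mutex_unlock(&vcpu_run_mtx[vcpu_id]);
 */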
2049 
2050 /*
2051  * set_return_data
2052  *
2053  * Utility function for manipulating register data in vm exit info structs. This
2054  * function ensures that the data is copied to the vei->vei.vei_data field with
2055  * the proper size for the operation being performed.
2056  *
2057  * Parameters:
2058  *  vei: exit information
2059  *  data: return data
2060  */
2061 void
2062 set_return_data(struct vm_exit *vei, uint32_t data)
2063 {
2064 	switch (vei->vei.vei_size) {
2065 	case 1:
2066 		vei->vei.vei_data &= ~0xFF;
2067 		vei->vei.vei_data |= (uint8_t)data;
2068 		break;
2069 	case 2:
2070 		vei->vei.vei_data &= ~0xFFFF;
2071 		vei->vei.vei_data |= (uint16_t)data;
2072 		break;
2073 	case 4:
2074 		vei->vei.vei_data = data;
2075 		break;
2076 	}
2077 }
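/*
 * Example usage (sketch): an i/o port handler answering an "in" access only
 * supplies the value; set_return_data() masks it to the access size recorded
 * in the exit info. The 0xff placeholder stands for "no data available":
 *
 *	if (vei->vei.vei_dir == VEI_DIR_IN)
 *		set_return_data(vei, 0xff);
 */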
2078 
2079 /*
2080  * get_input_data
2081  *
2082  * Utility function for manipulating register data in vm exit info
2083  * structs. This function ensures that the data is copied from the
2084  * vei->vei.vei_data field with the proper size for the operation being
2085  * performed.
2086  *
2087  * Parameters:
2088  *  vei: exit information
2089  *  data: location to store the result
2090  */
2091 void
2092 get_input_data(struct vm_exit *vei, uint32_t *data)
2093 {
2094 	switch (vei->vei.vei_size) {
2095 	case 1:
2096 		*data &= 0xFFFFFF00;
2097 		*data |= (uint8_t)vei->vei.vei_data;
2098 		break;
2099 	case 2:
2100 		*data &= 0xFFFF0000;
2101 		*data |= (uint16_t)vei->vei.vei_data;
2102 		break;
2103 	case 4:
2104 		*data = vei->vei.vei_data;
2105 		break;
2106 	default:
2107 		log_warnx("%s: invalid i/o size %d", __func__,
2108 		    vei->vei.vei_size);
2109 	}
2110 
2111 }
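/*
 * Example usage (sketch): the "out" direction is the mirror image; a port
 * handler fetches only the bytes the guest actually wrote:
 *
 *	uint32_t data = 0;
 *
 *	if (vei->vei.vei_dir == VEI_DIR_OUT)
 *		get_input_data(vei, &data);
 */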
2112 
2113 /*
2114  * translate_gva
2115  *
2116  * Translates a guest virtual address to a guest physical address by walking
2117  * the currently active page table (if needed).
2118  *
2119  * Note - this function can possibly alter the supplied VCPU state.
2120  *  Specifically, it may inject exceptions depending on the current VCPU
2121  *  configuration, and may alter %cr2 on #PF. Consequently, this function
2122  *  should only be used as part of instruction emulation.
2123  *
2124  * Parameters:
2125  *  exit: exit information for the VCPU this translation should be performed
2126  *   for (guest MMU settings are gathered from its saved register state)
2127  *  va: virtual address to translate
2128  *  pa: pointer to paddr_t variable that will receive the translated physical
2129  *   address. 'pa' is unchanged on error.
2130  *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
2131  *   the address should be translated
2132  *
2133  * Return values:
2134  *  0: the address was successfully translated - 'pa' contains the physical
2135  *     address currently mapped by 'va'.
2136  *  EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
2137  *     and %cr2 set in the vcpu structure.
2138  *  EINVAL: an error occurred reading paging table structures
 *  EPERM: the requested 'mode' is not permitted by the PTE (read-only page
 *      on a PROT_WRITE access, or supervisor page while the guest runs at
 *      CPL > 0)
 *  EIO: writing the updated accessed/dirty bits back to the PTE failed
2139  */
2140 int
2141 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
2142 {
2143 	int level, shift, pdidx;
2144 	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
2145 	uint64_t shift_width, pte_size;
2146 	struct vcpu_reg_state *vrs;
2147 
2148 	vrs = &exit->vrs;
2149 
2150 	if (!pa)
2151 		return (EINVAL);
2152 
2153 	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
2154 		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
2155 		*pa = va;
2156 		return (0);
2157 	}
2158 
2159 	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
2160 
2161 	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
2162 	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
2163 
2164 	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
2165 		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
2166 			pte_size = sizeof(uint64_t);
2167 			shift_width = 9;
2168 
2169 			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
2170 				/* 4 level paging */
2171 				level = 4;
2172 				mask = L4_MASK;
2173 				shift = L4_SHIFT;
2174 			} else {
2175 				/* 32 bit with PAE paging */
2176 				level = 3;
2177 				mask = L3_MASK;
2178 				shift = L3_SHIFT;
2179 			}
2180 		} else {
2181 			/* 32 bit paging */
2182 			level = 2;
2183 			shift_width = 10;
2184 			mask = 0xFFC00000;
2185 			shift = 22;
2186 			pte_size = sizeof(uint32_t);
2187 		}
2188 	} else
2189 		return (EINVAL);
2190 
2191 	/* XXX: Check for R bit in segment selector and set A bit */
2192 
2193 	for (;level > 0; level--) {
2194 		pdidx = (va & mask) >> shift;
2195 		pte_paddr = (pt_paddr) + (pdidx * pte_size);
2196 
2197 		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
2198 		    level, pte_paddr);
2199 		if (read_mem(pte_paddr, &pte, pte_size)) {
2200 			log_warn("%s: failed to read pte", __func__);
2201 			return (EFAULT);
2202 		}
2203 
2204 		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
2205 		    pte);
2206 
2207 		/* XXX: Set CR2  */
2208 		if (!(pte & PG_V))
2209 			return (EFAULT);
2210 
2211 		/* XXX: Check for SMAP */
2212 		if ((mode == PROT_WRITE) && !(pte & PG_RW))
2213 			return (EPERM);
2214 
2215 		if ((exit->cpl > 0) && !(pte & PG_u))
2216 			return (EPERM);
2217 
2218 		pte = pte | PG_U;
2219 		if (mode == PROT_WRITE)
2220 			pte = pte | PG_M;
2221 		if (write_mem(pte_paddr, &pte, pte_size)) {
2222 			log_warn("%s: failed to write back flags to pte",
2223 			    __func__);
2224 			return (EIO);
2225 		}
2226 
2227 		/* XXX: EINVAL if in 32 bit mode and PG_PS is 1 but CR4.PSE is 0 */
2228 		if (pte & PG_PS)
2229 			break;
2230 
2231 		if (level > 1) {
2232 			pt_paddr = pte & PG_FRAME;
2233 			shift -= shift_width;
2234 			mask = mask >> shift_width;
2235 		}
2236 	}
2237 
2238 	low_mask = (1 << shift) - 1;
2239 	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
2240 	*pa = (pte & high_mask) | (va & low_mask);
2241 
2242 	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);
2243 
2244 	return (0);
2245 }
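/*
 * Example usage (sketch): instruction emulation that needs the bytes at the
 * guest %rip can combine translate_gva() with read_mem(); 'vrp' is assumed
 * to be the current struct vm_run_params and the 4 byte fetch length is
 * illustrative (a real fetch would also handle page crossings):
 *
 *	struct vm_exit *ve = vrp->vrp_exit;
 *	uint64_t gpa;
 *	uint8_t insn[4];
 *
 *	if (translate_gva(ve, ve->vrs.vrs_gprs[VCPU_REGS_RIP], &gpa,
 *	    PROT_EXEC) == 0)
 *		read_mem(gpa, insn, sizeof(insn));
 */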
2246 
2247 /*
2248  * vm_pipe_init
2249  *
2250  * Initialize a vm_dev_pipe, setting up its file descriptors and its
2251  * event structure with the given callback.
2252  *
2253  * Parameters:
2254  *  p: pointer to vm_dev_pipe struct to initialize
2255  *  cb: callback to use for READ events on the read end of the pipe
2256  */
2257 void
2258 vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
2259 {
2260 	int ret;
2261 	int fds[2];
2262 
2263 	memset(p, 0, sizeof(struct vm_dev_pipe));
2264 
2265 	ret = pipe(fds);
2266 	if (ret)
2267 		fatal("failed to create vm_dev_pipe pipe");
2268 
2269 	p->read = fds[0];
2270 	p->write = fds[1];
2271 
2272 	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
2273 }
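/*
 * Example usage (sketch): a device normally sets up its pipe once during
 * initialization and then registers the read end with the event loop;
 * 'dev_pipe' and 'dev_dispatch' are hypothetical per-device names:
 *
 *	vm_pipe_init(&dev_pipe, dev_dispatch);
 *	event_add(&dev_pipe.read_ev, NULL);
 */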
2274 
2275 /*
2276  * vm_pipe_send
2277  *
2278  * Send a message to an emulated device via the provided vm_dev_pipe.
2279  *
2280  * Parameters:
2281  *  p: pointer to initialized vm_dev_pipe
2282  *  msg: message to send in the channel
2283  */
2284 void
2285 vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
2286 {
2287 	size_t n;
2288 	n = write(p->write, &msg, sizeof(msg));
2289 	if (n != sizeof(msg))
2290 		fatal("failed to write to device pipe");
2291 }
2292 
2293 /*
2294  * vm_pipe_recv
2295  *
2296  * Receive a message for an emulated device via the provided vm_dev_pipe.
2297  * Returns the message value, otherwise will exit on failure.
2298  *
2299  * Parameters:
2300  *  p: pointer to initialized vm_dev_pipe
2301  *
2302  * Return values:
2303  *  a value of enum pipe_msg_type or fatal exit on read(2) error
2304  */
2305 enum pipe_msg_type
2306 vm_pipe_recv(struct vm_dev_pipe *p)
2307 {
2308 	size_t n;
2309 	enum pipe_msg_type msg;
2310 	n = read(p->read, &msg, sizeof(msg));
2311 	if (n != sizeof(msg))
2312 		fatal("failed to read from device pipe");
2313 
2314 	return msg;
2315 }
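/*
 * Example usage (sketch): the two ends pair up across threads; a control or
 * vcpu thread posts a message and the device's event callback collects it.
 * MSG_TYPE_PAUSE is an assumed enum pipe_msg_type value used only for
 * illustration:
 *
 *	vm_pipe_send(&dev_pipe, MSG_TYPE_PAUSE);	// sending side
 *
 *	msg = vm_pipe_recv(&dev_pipe);			// inside dev_dispatch()
 */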
2316