1 /*	$OpenBSD: vm.c,v 1.88 2023/04/28 19:46:42 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* PAGE_SIZE, MAXCOMLEN */
20 #include <sys/types.h>
21 #include <sys/ioctl.h>
22 #include <sys/queue.h>
23 #include <sys/wait.h>
24 #include <sys/uio.h>
25 #include <sys/stat.h>
26 #include <sys/socket.h>
27 #include <sys/time.h>
28 #include <sys/mman.h>
29 #include <sys/resource.h>
30 
31 #include <dev/ic/i8253reg.h>
32 #include <dev/isa/isareg.h>
33 #include <dev/pci/pcireg.h>
34 
35 #include <machine/psl.h>
36 #include <machine/pte.h>
37 #include <machine/specialreg.h>
38 #include <machine/vmmvar.h>
39 
40 #include <net/if.h>
41 
42 #include <errno.h>
43 #include <event.h>
44 #include <fcntl.h>
45 #include <imsg.h>
46 #include <limits.h>
47 #include <poll.h>
48 #include <pthread.h>
49 #include <pthread_np.h>
50 #include <stddef.h>
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <unistd.h>
55 #include <util.h>
56 
57 #include "atomicio.h"
58 #include "fw_cfg.h"
59 #include "i8253.h"
60 #include "i8259.h"
61 #include "loadfile.h"
62 #include "mc146818.h"
63 #include "mmio.h"
64 #include "ns8250.h"
65 #include "pci.h"
66 #include "virtio.h"
67 #include "vmd.h"
68 #include "vmm.h"
69 
70 #define MB(x)	((x) * 1024UL * 1024UL)
71 #define GB(x)	((x) * 1024UL * 1024UL * 1024UL)
72 
73 #define MMIO_NOTYET 0
74 
75 io_fn_t ioports_map[MAX_PORTS];
76 
77 static int run_vm(struct vmop_create_params *, struct vcpu_reg_state *);
78 void vm_dispatch_vmm(int, short, void *);
79 void *event_thread(void *);
80 void *vcpu_run_loop(void *);
81 int vcpu_exit(struct vm_run_params *);
82 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
83 void create_memory_map(struct vm_create_params *);
84 static int vmm_create_vm(struct vmd_vm *);
85 int alloc_guest_mem(struct vmd_vm *);
86 void init_emulated_hw(struct vmop_create_params *, int,
87     int[][VM_MAX_BASE_PER_DISK], int *);
88 void restore_emulated_hw(struct vm_create_params *, int, int *,
89     int[][VM_MAX_BASE_PER_DISK], int);
90 void vcpu_exit_inout(struct vm_run_params *);
91 int vcpu_exit_eptviolation(struct vm_run_params *);
92 uint8_t vcpu_exit_pci(struct vm_run_params *);
93 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
94 int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
95 static int send_vm(int, struct vmd_vm *);
96 int dump_send_header(int);
97 static int dump_vmr(int , struct vm_mem_range *);
98 static int dump_mem(int, struct vmd_vm *);
99 void restore_vmr(int, struct vm_mem_range *);
100 void restore_mem(int, struct vm_create_params *);
101 int restore_vm_params(int, struct vm_create_params *);
102 static void pause_vm(struct vmd_vm *);
103 static void unpause_vm(struct vmd_vm *);
104 
105 int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int);
106 
107 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
108     size_t);
109 
110 int con_fd;
111 struct vmd_vm *current_vm;
112 
113 extern struct vmd *env;
114 
115 extern char *__progname;
116 
117 pthread_mutex_t threadmutex;
118 pthread_cond_t threadcond;
119 
120 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
121 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
122 pthread_barrier_t vm_pause_barrier;
123 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
124 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
125 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
126 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
127 
128 /*
129  * Represents a standard register set for an OS to be booted
130  * as a flat 64 bit address space.
131  *
132  * NOT set here are:
133  *  RIP
134  *  RSP
135  *  GDTR BASE
136  *
137  * Specific bootloaders should clone this structure and override
138  * those fields as needed.
139  *
140  * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
141  *        features of the CPU in use.
142  */
143 static const struct vcpu_reg_state vcpu_init_flat64 = {
144 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
145 	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
146 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
147 	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
148 	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
149 	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
150 	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
151 	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
152 	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
153 	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
154 	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
155 	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
156 	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
157 	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
158 	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
159 	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
160 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
161 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
162 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
163 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
164 	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
165 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
166 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
167 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
168 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
169 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
170 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
171 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
172 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
173 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
174 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
175 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
176 	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
177 	.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
178 };
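
/*
 * Illustrative sketch (not compiled): how a bootloader-specific setup
 * might clone vcpu_init_flat64 and fill in the fields the comment above
 * leaves unset.  The entry, stack and GDT base values are hypothetical
 * placeholders; the vsi_base member name is assumed from vmmvar.h.
 */
#if 0
static void
example_flat64_regs(struct vcpu_reg_state *vrs, uint64_t entry,
    uint64_t stack, uint64_t gdt_base)
{
	memcpy(vrs, &vcpu_init_flat64, sizeof(*vrs));
	vrs->vrs_gprs[VCPU_REGS_RIP] = entry;
	vrs->vrs_gprs[VCPU_REGS_RSP] = stack;
	vrs->vrs_gdtr.vsi_base = gdt_base;
}
#endif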
179 
180 /*
181  * Represents a standard register set for a BIOS to be booted
182  * as a flat 16 bit address space.
183  */
184 static const struct vcpu_reg_state vcpu_init_flat16 = {
185 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
186 	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
187 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
188 	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
189 	.vrs_crs[VCPU_REGS_CR3] = 0,
190 	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
191 	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
192 	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
193 	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
194 	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
195 	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
196 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
197 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
198 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
199 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
200 	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
201 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
202 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
203 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
204 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
205 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
206 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
207 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
208 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
209 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
210 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
211 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
212 	.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
213 };
214 
215 /*
216  * vm_main
217  *
218  * Primary entrypoint for launching a vm. Does not return.
219  *
220  * fd: file descriptor for communicating with vmm process.
221  */
222 void
223 vm_main(int fd)
224 {
225 	struct vm_create_params	*vcp = NULL;
226 	struct vmd_vm		 vm;
227 	size_t			 sz = 0;
228 	int			 ret = 0;
229 
230 	/*
231 	 * We aren't root, so we can't chroot(2). Use unveil(2) instead.
232 	 */
233 	if (unveil(env->argv0, "x") == -1)
234 		fatal("unveil %s", env->argv0);
235 	if (unveil(NULL, NULL) == -1)
236 		fatal("unveil lock");
237 
238 	/*
239 	 * pledge in the vm processes:
240 	 * stdio - for malloc and basic I/O including events.
241 	 * vmm - for the vmm ioctls and operations.
242 	 * proc exec - fork/exec for launching devices.
243 	 * recvfd - for vm send/recv and sending fd to devices.
244 	 * tmppath/rpath - for shm_mkstemp, ftruncate, unlink
245 	 */
246 	if (pledge("stdio vmm proc exec recvfd tmppath rpath", NULL) == -1)
247 		fatal("pledge");
248 
249 	/* Receive our vm configuration. */
250 	memset(&vm, 0, sizeof(vm));
251 	sz = atomicio(read, fd, &vm, sizeof(vm));
252 	if (sz != sizeof(vm)) {
253 		log_warnx("failed to receive start message");
254 		_exit(EIO);
255 	}
256 
257 	/* Receive the /dev/vmm fd number. */
258 	sz = atomicio(read, fd, &env->vmd_fd, sizeof(env->vmd_fd));
259 	if (sz != sizeof(env->vmd_fd)) {
260 		log_warnx("failed to receive /dev/vmm fd");
261 		_exit(EIO);
262 	}
263 
264 	/* Update process with the vm name. */
265 	vcp = &vm.vm_params.vmc_params;
266 	setproctitle("%s", vcp->vcp_name);
267 	log_procinit(vcp->vcp_name);
268 
269 	/*
270 	 * We need, at minimum, a vm_kernel fd to boot a vm. This is either a
271 	 * kernel or a BIOS image.
272 	 */
273 	if (!(vm.vm_state & VM_STATE_RECEIVED)) {
274 		if (vm.vm_kernel == -1) {
275 			log_warnx("%s: failed to receive boot fd",
276 			    vcp->vcp_name);
277 			_exit(EINVAL);
278 		}
279 		if (fcntl(vm.vm_kernel, F_SETFL, O_NONBLOCK) == -1) {
280 			ret = errno;
281 			log_warn("failed to set nonblocking mode on boot fd");
282 			_exit(ret);
283 		}
284 	}
285 
286 	ret = start_vm(&vm, fd);
287 	_exit(ret);
288 }
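
/*
 * For context, a sketch of the sending side of the handshake above
 * (assumption only; the real code lives in the vmm parent process, not
 * in this file): the parent writes the same two fixed-size records with
 * atomicio() before the child reads them.  "vm", "vmm_fd" and "fd" are
 * hypothetical parent-side locals.
 */
#if 0
if (atomicio(vwrite, fd, vm, sizeof(*vm)) != sizeof(*vm) ||
    atomicio(vwrite, fd, &vmm_fd, sizeof(vmm_fd)) != sizeof(vmm_fd))
	fatal("vm start handshake");
#endif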
289 
290 /*
291  * loadfile_bios
292  *
293  * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
294  * directly into memory.
295  *
296  * Parameters:
297  *  fp: file handle of the BIOS image to load
298  *  size: uncompressed size of the image
299  *  (out) vrs: register state to set on init for this kernel
300  *
301  * Return values:
302  *  0 if successful
303  *  various error codes returned from read(2) or loadelf functions
304  */
305 int
306 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
307 {
308 	off_t	 off;
309 
310 	/* Set up a "flat 16 bit" register state for BIOS */
311 	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
312 
313 	/* Seek to the beginning of the BIOS image */
314 	if (gzseek(fp, 0, SEEK_SET) == -1)
315 		return (-1);
316 
317 	/* The BIOS image must end at 1MB */
318 	if ((off = MB(1) - size) < 0)
319 		return (-1);
320 
321 	/* Read BIOS image into memory */
322 	if (mread(fp, off, size) != (size_t)size) {
323 		errno = EIO;
324 		return (-1);
325 	}
326 
327 	if (gzseek(fp, 0, SEEK_SET) == -1)
328 		return (-1);
329 
330 	/* Read a second BIOS copy into memory ending at 4GB */
331 	off = GB(4) - size;
332 	if (mread(fp, off, size) != (size_t)size) {
333 		errno = EIO;
334 		return (-1);
335 	}
336 
337 	log_debug("%s: loaded BIOS image", __func__);
338 
339 	return (0);
340 }
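
/*
 * Worked example of the placement math above, assuming a 64KB BIOS image:
 * the first copy ends at the 1MB boundary and the second copy ends at 4GB
 * (the reset-vector alias).
 */
#if 0
off_t size = 64 * 1024;
off_t low  = MB(1) - size;	/* 0x000F0000 */
off_t high = GB(4) - size;	/* 0xFFFF0000 */
#endif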
341 
342 /*
343  * start_vm
344  *
345  * After forking a new VM process, starts the new VM with the creation
346  * parameters supplied (in the incoming vm->vm_params field). This
347  * function performs a basic sanity check on the incoming parameters
348  * and then performs the following steps to complete the creation of the VM:
349  *
350  * 1. validates and creates the new VM
351  * 2. opens the imsg control channel to the parent and drops more privilege
352  * 3. drops additional privileges by calling pledge(2)
353  * 4. loads the kernel from the disk image or file descriptor
354  * 5. runs the VM's VCPU loops.
355  *
356  * Parameters:
357  *  vm: The VM data structure, including the VM create parameters.
358  *  fd: The imsg socket that is connected to the parent process.
359  *
360  * Return values:
361  *  0: success
362  *  !0 : failure - typically an errno indicating the source of the failure
363  */
364 int
365 start_vm(struct vmd_vm *vm, int fd)
366 {
367 	struct vmop_create_params *vmc = &vm->vm_params;
368 	struct vm_create_params	*vcp = &vmc->vmc_params;
369 	struct vcpu_reg_state	 vrs;
370 	int			 nicfds[VM_MAX_NICS_PER_VM];
371 	int			 ret;
372 	gzFile			 fp;
373 	size_t			 i;
374 	struct vm_rwregs_params  vrp;
375 	struct stat		 sb;
376 
377 	/*
378 	 * We first try to initialize and allocate memory before bothering
379 	 * vmm(4) with a request to create a new vm.
380 	 */
381 	if (!(vm->vm_state & VM_STATE_RECEIVED))
382 		create_memory_map(vcp);
383 
384 	ret = alloc_guest_mem(vm);
385 	if (ret) {
386 		struct rlimit lim;
387 		char buf[FMT_SCALED_STRSIZE];
388 		if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) {
389 			if (fmt_scaled(lim.rlim_cur, buf) == 0)
390 				fatalx("could not allocate guest memory (data "
391 				    "limit is %s)", buf);
392 		}
393 		errno = ret;
394 		log_warn("could not allocate guest memory");
395 		return (ret);
396 	}
397 
398 	/* We've allocated guest memory, so now create the vm in vmm(4). */
399 	ret = vmm_create_vm(vm);
400 	if (ret) {
401 		/* Let the vmm process know we failed by sending a 0 vm id. */
402 		vcp->vcp_id = 0;
403 		atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id));
404 		return (ret);
405 	}
406 
407 	/*
408 	 * Parts of vmd currently rely on global state (current_vm, con_fd).
409 	 */
410 	current_vm = vm;
411 	con_fd = vm->vm_tty;
412 	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) {
413 		log_warn("failed to set nonblocking mode on console");
414 		return (1);
415 	}
416 
417 	/*
418 	 * We now let the vmm process know we were successful by sending it our
419 	 * vmm(4) assigned vm id.
420 	 */
421 	if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
422 	    sizeof(vcp->vcp_id)) {
423 		log_warn("failed to send created vm id to vmm process");
424 		return (1);
425 	}
426 
427 	/* Either prepare our boot image or receive an existing vm to launch. */
428 	if (vm->vm_state & VM_STATE_RECEIVED) {
429 		ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp));
430 		if (ret != sizeof(vrp))
431 			fatal("received incomplete vrp - exiting");
432 		vrs = vrp.vrwp_regs;
433 	} else {
434 		/*
435 		 * Set up default "flat 64 bit" register state - RIP,
436 		 * RSP, and GDT info will be set in bootloader
437 		 */
438 		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
439 
440 		/* Find and open kernel image */
441 		if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
442 			fatalx("failed to open kernel - exiting");
443 
444 		/* Load kernel image */
445 		ret = loadfile_elf(fp, vm, &vrs, vmc->vmc_bootdevice);
446 
447 		/*
448 		 * Try BIOS as a fallback (only if it was provided as an image
449 		 * with vm->vm_kernel and the file is not compressed)
450 		 */
451 		if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
452 		    gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
453 			ret = loadfile_bios(fp, sb.st_size, &vrs);
454 
455 		if (ret)
456 			fatal("failed to load kernel or BIOS - exiting");
457 
458 		gzclose(fp);
459 	}
460 
461 	if (vm->vm_kernel != -1)
462 		close_fd(vm->vm_kernel);
463 
464 	/* Initialize our mutexes. */
465 	ret = pthread_mutex_init(&threadmutex, NULL);
466 	if (ret) {
467 		log_warn("%s: could not initialize thread state mutex",
468 		    __func__);
469 		return (ret);
470 	}
471 	ret = pthread_cond_init(&threadcond, NULL);
472 	if (ret) {
473 		log_warn("%s: could not initialize thread state "
474 		    "condition variable", __func__);
475 		return (ret);
476 	}
477 	mutex_lock(&threadmutex);
478 
479 
480 	/*
481 	 * Finalize our communication socket with the vmm process. From here
482 	 * onwards, communication with the vmm process is event-based.
483 	 */
484 	event_init();
485 	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
486 		fatal("setup vm pipe");
487 
488 	/*
489 	 * Initialize or restore our emulated hardware.
490 	 */
491 	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
492 		nicfds[i] = vm->vm_ifs[i].vif_fd;
493 
494 	if (vm->vm_state & VM_STATE_RECEIVED) {
495 		restore_mem(vm->vm_receive_fd, vcp);
496 		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
497 		    vm->vm_disks, vm->vm_cdrom);
498 		if (restore_vm_params(vm->vm_receive_fd, vcp))
499 			fatal("restore vm params failed");
500 		unpause_vm(vm);
501 	} else
502 		init_emulated_hw(vmc, vm->vm_cdrom, vm->vm_disks, nicfds);
503 
504 	/* Drop privileges further before starting the vcpu run loop(s). */
505 	if (pledge("stdio vmm recvfd", NULL) == -1)
506 		fatal("pledge");
507 
508 	/*
509 	 * Execute the vcpu run loop(s) for this VM.
510 	 */
511 	ret = run_vm(&vm->vm_params, &vrs);
512 
513 	/* Ensure that any in-flight data is written back */
514 	virtio_shutdown(vm);
515 
516 	return (ret);
517 }
518 
519 /*
520  * vm_dispatch_vmm
521  *
522  * imsg callback for messages that are received from the vmm parent process.
523  */
524 void
525 vm_dispatch_vmm(int fd, short event, void *arg)
526 {
527 	struct vmd_vm		*vm = arg;
528 	struct vmop_result	 vmr;
529 	struct vmop_addr_result	 var;
530 	struct imsgev		*iev = &vm->vm_iev;
531 	struct imsgbuf		*ibuf = &iev->ibuf;
532 	struct imsg		 imsg;
533 	ssize_t			 n;
534 	int			 verbose;
535 
536 	if (event & EV_READ) {
537 		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
538 			fatal("%s: imsg_read", __func__);
539 		if (n == 0)
540 			_exit(0);
541 	}
542 
543 	if (event & EV_WRITE) {
544 		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
545 			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
546 		if (n == 0)
547 			_exit(0);
548 	}
549 
550 	for (;;) {
551 		if ((n = imsg_get(ibuf, &imsg)) == -1)
552 			fatal("%s: imsg_get", __func__);
553 		if (n == 0)
554 			break;
555 
556 #if DEBUG > 1
557 		log_debug("%s: got imsg %d from %s",
558 		    __func__, imsg.hdr.type,
559 		    vm->vm_params.vmc_params.vcp_name);
560 #endif
561 
562 		switch (imsg.hdr.type) {
563 		case IMSG_CTL_VERBOSE:
564 			IMSG_SIZE_CHECK(&imsg, &verbose);
565 			memcpy(&verbose, imsg.data, sizeof(verbose));
566 			log_setverbose(verbose);
567 			break;
568 		case IMSG_VMDOP_VM_SHUTDOWN:
569 			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
570 				_exit(0);
571 			break;
572 		case IMSG_VMDOP_VM_REBOOT:
573 			if (vmmci_ctl(VMMCI_REBOOT) == -1)
574 				_exit(0);
575 			break;
576 		case IMSG_VMDOP_PAUSE_VM:
577 			vmr.vmr_result = 0;
578 			vmr.vmr_id = vm->vm_vmid;
579 			pause_vm(vm);
580 			imsg_compose_event(&vm->vm_iev,
581 			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
582 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
583 			    sizeof(vmr));
584 			break;
585 		case IMSG_VMDOP_UNPAUSE_VM:
586 			vmr.vmr_result = 0;
587 			vmr.vmr_id = vm->vm_vmid;
588 			unpause_vm(vm);
589 			imsg_compose_event(&vm->vm_iev,
590 			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
591 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
592 			    sizeof(vmr));
593 			break;
594 		case IMSG_VMDOP_SEND_VM_REQUEST:
595 			vmr.vmr_id = vm->vm_vmid;
596 			vmr.vmr_result = send_vm(imsg.fd, vm);
597 			imsg_compose_event(&vm->vm_iev,
598 			    IMSG_VMDOP_SEND_VM_RESPONSE,
599 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
600 			    sizeof(vmr));
601 			if (!vmr.vmr_result) {
602 				imsg_flush(&current_vm->vm_iev.ibuf);
603 				_exit(0);
604 			}
605 			break;
606 		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
607 			IMSG_SIZE_CHECK(&imsg, &var);
608 			memcpy(&var, imsg.data, sizeof(var));
609 
610 			log_debug("%s: received tap addr %s for nic %d",
611 			    vm->vm_params.vmc_params.vcp_name,
612 			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);
613 
614 			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
615 			break;
616 		default:
617 			fatalx("%s: got invalid imsg %d from %s",
618 			    __func__, imsg.hdr.type,
619 			    vm->vm_params.vmc_params.vcp_name);
620 		}
621 		imsg_free(&imsg);
622 	}
623 	imsg_event_add(iev);
624 }
625 
626 /*
627  * vm_shutdown
628  *
629  * Tell the vmm parent process to shut down or reboot the VM and exit.
630  */
631 __dead void
632 vm_shutdown(unsigned int cmd)
633 {
634 	switch (cmd) {
635 	case VMMCI_NONE:
636 	case VMMCI_SHUTDOWN:
637 		(void)imsg_compose_event(&current_vm->vm_iev,
638 		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
639 		break;
640 	case VMMCI_REBOOT:
641 		(void)imsg_compose_event(&current_vm->vm_iev,
642 		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
643 		break;
644 	default:
645 		fatalx("invalid vm ctl command: %d", cmd);
646 	}
647 	imsg_flush(&current_vm->vm_iev.ibuf);
648 
649 	_exit(0);
650 }
651 
652 int
653 send_vm(int fd, struct vmd_vm *vm)
654 {
655 	struct vm_rwregs_params	   vrp;
656 	struct vm_rwvmparams_params vpp;
657 	struct vmop_create_params *vmc;
658 	struct vm_terminate_params vtp;
659 	unsigned int		   flags = 0;
660 	unsigned int		   i;
661 	int			   ret = 0;
662 	size_t			   sz;
663 
664 	if (dump_send_header(fd)) {
665 		log_warnx("%s: failed to send vm dump header", __func__);
666 		goto err;
667 	}
668 
669 	pause_vm(vm);
670 
671 	vmc = calloc(1, sizeof(struct vmop_create_params));
672 	if (vmc == NULL) {
673 		log_warn("%s: calloc error getting vmc", __func__);
674 		ret = -1;
675 		goto err;
676 	}
677 
678 	flags |= VMOP_CREATE_MEMORY;
679 	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
680 	    vmop_create_params));
681 	vmc->vmc_flags = flags;
682 	vrp.vrwp_vm_id = vm->vm_params.vmc_params.vcp_id;
683 	vrp.vrwp_mask = VM_RWREGS_ALL;
684 	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
685 	vpp.vpp_vm_id = vm->vm_params.vmc_params.vcp_id;
686 
687 	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
688 	if (sz != sizeof(struct vmop_create_params)) {
689 		ret = -1;
690 		goto err;
691 	}
692 
693 	for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
694 		vrp.vrwp_vcpu_id = i;
695 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
696 			log_warn("%s: readregs failed", __func__);
697 			goto err;
698 		}
699 
700 		sz = atomicio(vwrite, fd, &vrp,
701 		    sizeof(struct vm_rwregs_params));
702 		if (sz != sizeof(struct vm_rwregs_params)) {
703 			log_warn("%s: dumping registers failed", __func__);
704 			ret = -1;
705 			goto err;
706 		}
707 	}
708 
709 	/* Dump memory before devices to aid in restoration. */
710 	if ((ret = dump_mem(fd, vm)))
711 		goto err;
712 	if ((ret = i8253_dump(fd)))
713 		goto err;
714 	if ((ret = i8259_dump(fd)))
715 		goto err;
716 	if ((ret = ns8250_dump(fd)))
717 		goto err;
718 	if ((ret = mc146818_dump(fd)))
719 		goto err;
720 	if ((ret = fw_cfg_dump(fd)))
721 		goto err;
722 	if ((ret = pci_dump(fd)))
723 		goto err;
724 	if ((ret = virtio_dump(fd)))
725 		goto err;
726 
727 	for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
728 		vpp.vpp_vcpu_id = i;
729 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
730 			log_warn("%s: readvmparams failed", __func__);
731 			goto err;
732 		}
733 
734 		sz = atomicio(vwrite, fd, &vpp,
735 		    sizeof(struct vm_rwvmparams_params));
736 		if (sz != sizeof(struct vm_rwvmparams_params)) {
737 			log_warn("%s: dumping vm params failed", __func__);
738 			ret = -1;
739 			goto err;
740 		}
741 	}
742 
743 	vtp.vtp_vm_id = vm->vm_params.vmc_params.vcp_id;
744 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
745 		log_warnx("%s: term IOC error: %d, %d", __func__,
746 		    errno, ENOENT);
747 	}
748 err:
749 	close(fd);
750 	if (ret)
751 		unpause_vm(vm);
752 	return ret;
753 }
754 
755 int
756 dump_send_header(int fd) {
757 	struct vm_dump_header	   vmh;
758 	int			   i;
759 
760 	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
761 	    sizeof(vmh.vmh_signature));
762 
763 	vmh.vmh_cpuids[0].code = 0x00;
764 	vmh.vmh_cpuids[0].leaf = 0x00;
765 
766 	vmh.vmh_cpuids[1].code = 0x01;
767 	vmh.vmh_cpuids[1].leaf = 0x00;
768 
769 	vmh.vmh_cpuids[2].code = 0x07;
770 	vmh.vmh_cpuids[2].leaf = 0x00;
771 
772 	vmh.vmh_cpuids[3].code = 0x0d;
773 	vmh.vmh_cpuids[3].leaf = 0x00;
774 
775 	vmh.vmh_cpuids[4].code = 0x80000001;
776 	vmh.vmh_cpuids[4].leaf = 0x00;
777 
778 	vmh.vmh_version = VM_DUMP_VERSION;
779 
780 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
781 		CPUID_LEAF(vmh.vmh_cpuids[i].code,
782 		    vmh.vmh_cpuids[i].leaf,
783 		    vmh.vmh_cpuids[i].a,
784 		    vmh.vmh_cpuids[i].b,
785 		    vmh.vmh_cpuids[i].c,
786 		    vmh.vmh_cpuids[i].d);
787 	}
788 
789 	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
790 		return (-1);
791 
792 	return (0);
793 }
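
/*
 * Sketch of the matching receive-side check (assumption: the real
 * validation happens in the vmm parent when a vm is received, not in
 * this file): read the header back and verify signature and version
 * before trusting the rest of the dump stream.  "fd" is a hypothetical
 * descriptor for the incoming dump.
 */
#if 0
struct vm_dump_header vmh;

if (atomicio(read, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
	fatal("failed to read dump header");
if (memcmp(vmh.vmh_signature, VM_DUMP_SIGNATURE,
    sizeof(vmh.vmh_signature)) != 0 || vmh.vmh_version != VM_DUMP_VERSION)
	fatalx("unsupported vm dump header");
#endif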
794 
795 int
796 dump_mem(int fd, struct vmd_vm *vm)
797 {
798 	unsigned int	i;
799 	int		ret;
800 	struct		vm_mem_range *vmr;
801 
802 	for (i = 0; i < vm->vm_params.vmc_params.vcp_nmemranges; i++) {
803 		vmr = &vm->vm_params.vmc_params.vcp_memranges[i];
804 		ret = dump_vmr(fd, vmr);
805 		if (ret)
806 			return ret;
807 	}
808 	return (0);
809 }
810 
811 int
812 restore_vm_params(int fd, struct vm_create_params *vcp) {
813 	unsigned int			i;
814 	struct vm_rwvmparams_params    vpp;
815 
816 	for (i = 0; i < vcp->vcp_ncpus; i++) {
817 		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
818 			log_warn("%s: error restoring vm params", __func__);
819 			return (-1);
820 		}
821 		vpp.vpp_vm_id = vcp->vcp_id;
822 		vpp.vpp_vcpu_id = i;
823 		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
824 			log_debug("%s: writing vm params failed", __func__);
825 			return (-1);
826 		}
827 	}
828 	return (0);
829 }
830 
831 void
832 restore_mem(int fd, struct vm_create_params *vcp)
833 {
834 	unsigned int	     i;
835 	struct vm_mem_range *vmr;
836 
837 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
838 		vmr = &vcp->vcp_memranges[i];
839 		restore_vmr(fd, vmr);
840 	}
841 }
842 
843 int
844 dump_vmr(int fd, struct vm_mem_range *vmr)
845 {
846 	size_t	rem = vmr->vmr_size, read = 0;
847 	char	buf[PAGE_SIZE];
848 
849 	while (rem > 0) {
850 		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
851 			log_warn("failed to read vmr");
852 			return (-1);
853 		}
854 		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
855 			log_warn("failed to dump vmr");
856 			return (-1);
857 		}
858 		rem = rem - PAGE_SIZE;
859 		read = read + PAGE_SIZE;
860 	}
861 	return (0);
862 }
863 
864 void
865 restore_vmr(int fd, struct vm_mem_range *vmr)
866 {
867 	size_t	rem = vmr->vmr_size, wrote = 0;
868 	char	buf[PAGE_SIZE];
869 
870 	while (rem > 0) {
871 		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
872 			fatal("failed to restore vmr");
873 		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
874 			fatal("failed to write vmr");
875 		rem = rem - PAGE_SIZE;
876 		wrote = wrote + PAGE_SIZE;
877 	}
878 }
879 
880 static void
881 pause_vm(struct vmd_vm *vm)
882 {
883 	unsigned int n;
884 	int ret;
885 	if (vm->vm_state & VM_STATE_PAUSED)
886 		return;
887 
888 	current_vm->vm_state |= VM_STATE_PAUSED;
889 
890 	ret = pthread_barrier_init(&vm_pause_barrier, NULL,
891 	    vm->vm_params.vmc_params.vcp_ncpus + 1);
892 	if (ret) {
893 		log_warnx("%s: cannot initialize pause barrier (%d)",
894 		    __progname, ret);
895 		return;
896 	}
897 
898 	for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) {
899 		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
900 		if (ret) {
901 			log_warnx("%s: can't broadcast vcpu run cond (%d)",
902 			    __func__, (int)ret);
903 			return;
904 		}
905 	}
906 	ret = pthread_barrier_wait(&vm_pause_barrier);
907 	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
908 		log_warnx("%s: could not wait on pause barrier (%d)",
909 		    __func__, (int)ret);
910 		return;
911 	}
912 
913 	ret = pthread_barrier_destroy(&vm_pause_barrier);
914 	if (ret) {
915 		log_warnx("%s: could not destroy pause barrier (%d)",
916 		    __progname, ret);
917 		return;
918 	}
919 
920 	i8253_stop();
921 	mc146818_stop();
922 	ns8250_stop();
923 	virtio_stop(vm);
924 }
925 
926 static void
927 unpause_vm(struct vmd_vm *vm)
928 {
929 	unsigned int n;
930 	int ret;
931 	if (!(vm->vm_state & VM_STATE_PAUSED))
932 		return;
933 
934 	current_vm->vm_state &= ~VM_STATE_PAUSED;
935 	for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) {
936 		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
937 		if (ret) {
938 			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
939 			    __func__, (int)ret);
940 			return;
941 		}
942 	}
943 
944 	i8253_start();
945 	mc146818_start();
946 	ns8250_start();
947 	virtio_start(vm);
948 }
949 
950 /*
951  * vcpu_reset
952  *
953  * Requests vmm(4) to reset a VCPU in the indicated VM to
954  * the register state provided
955  *
956  * Parameters:
957  *  vmid: VM ID to reset
958  *  vcpu_id: VCPU ID to reset
959  *  vrs: the register state to initialize
960  *
961  * Return values:
962  *  0: success
963  *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
964  *      valid)
965  */
966 int
967 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
968 {
969 	struct vm_resetcpu_params vrp;
970 
971 	memset(&vrp, 0, sizeof(vrp));
972 	vrp.vrp_vm_id = vmid;
973 	vrp.vrp_vcpu_id = vcpu_id;
974 	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
975 
976 	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
977 
978 	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
979 		return (errno);
980 
981 	return (0);
982 }
983 
984 /*
985  * create_memory_map
986  *
987  * Sets up the guest physical memory ranges that the VM can access.
988  *
989  * Parameters:
990  *  vcp: VM create parameters describing the VM whose memory map
991  *       is being created
992  *
993  * Return values:
994  *  nothing
995  */
996 void
997 create_memory_map(struct vm_create_params *vcp)
998 {
999 	size_t len, mem_bytes;
1000 	size_t above_1m = 0, above_4g = 0;
1001 
1002 	mem_bytes = vcp->vcp_memranges[0].vmr_size;
1003 	vcp->vcp_nmemranges = 0;
1004 	if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE)
1005 		return;
1006 
1007 	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
1008 	len = LOWMEM_KB * 1024;
1009 	vcp->vcp_memranges[0].vmr_gpa = 0x0;
1010 	vcp->vcp_memranges[0].vmr_size = len;
1011 	vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM;
1012 	mem_bytes -= len;
1013 
1014 	/*
1015 	 * Second memory region: LOWMEM_KB - 1MB.
1016 	 *
1017 	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
1018 	 * We have to add this region, because some systems
1019 	 * unconditionally write to 0xb8000 (VGA RAM), and
1020 	 * we need to make sure that vmm(4) permits accesses
1021 	 * to it. So allocate guest memory for it.
1022 	 */
1023 	len = MB(1) - (LOWMEM_KB * 1024);
1024 	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
1025 	vcp->vcp_memranges[1].vmr_size = len;
1026 	vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED;
1027 	mem_bytes -= len;
1028 
1029 	/* If we have less than 2MB remaining, still create a 2nd BIOS area. */
1030 	if (mem_bytes <= MB(2)) {
1031 		vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END;
1032 		vcp->vcp_memranges[2].vmr_size = MB(2);
1033 		vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED;
1034 		vcp->vcp_nmemranges = 3;
1035 		return;
1036 	}
1037 
1038 	/*
1039 	 * Calculate how to split any remaining memory across the 4GB
1040 	 * boundary while making sure we do not place physical memory into
1041 	 * MMIO ranges.
1042 	 */
1043 	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) {
1044 		above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1);
1045 		above_4g = mem_bytes - above_1m;
1046 	} else {
1047 		above_1m = mem_bytes;
1048 		above_4g = 0;
1049 	}
1050 
1051 	/* Third memory region: area above 1MB to MMIO region */
1052 	vcp->vcp_memranges[2].vmr_gpa = MB(1);
1053 	vcp->vcp_memranges[2].vmr_size = above_1m;
1054 	vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM;
1055 
1056 	/* Fourth region: PCI MMIO range */
1057 	vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE;
1058 	vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END -
1059 	    VMM_PCI_MMIO_BAR_BASE + 1;
1060 	vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO;
1061 
1062 	/* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */
1063 	vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
1064 	vcp->vcp_memranges[4].vmr_size = MB(2);
1065 	vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED;
1066 
1067 	/* Sixth region: any remainder above 4GB */
1068 	if (above_4g > 0) {
1069 		vcp->vcp_memranges[5].vmr_gpa = GB(4);
1070 		vcp->vcp_memranges[5].vmr_size = above_4g;
1071 		vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM;
1072 		vcp->vcp_nmemranges = 6;
1073 	} else
1074 		vcp->vcp_nmemranges = 5;
1075 }
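
/*
 * Worked example of the layout above for a guest configured with 4GB of
 * memory (expressed with the constants used by create_memory_map):
 *
 *   range 0: [0, LOWMEM_KB * 1024)            RAM      (DOS low mem)
 *   range 1: [LOWMEM_KB * 1024, 1MB)          reserved (ROM/VGA hole)
 *   range 2: [1MB, VMM_PCI_MMIO_BAR_BASE)     RAM
 *   range 3: [VMM_PCI_MMIO_BAR_BASE,
 *             VMM_PCI_MMIO_BAR_END]           MMIO     (PCI BARs)
 *   range 4: [VMM_PCI_MMIO_BAR_END + 1, 4GB)  reserved (2nd BIOS copy, 2MB)
 *   range 5: [4GB, 4GB + above_4g)            RAM
 *
 * where above_4g = 4GB - VMM_PCI_MMIO_BAR_BASE, i.e. the memory displaced
 * by the MMIO hole reappears above the 4GB boundary.
 */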
1076 
1077 /*
1078  * alloc_guest_mem
1079  *
1080  * Allocates memory for the guest.
1081  * Instead of doing a single allocation with one mmap(), we allocate memory
1082  * separately for every range for the following reasons:
1083  * - ASLR for the individual ranges
1084  * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
1085  *   map the single mmap'd userspace memory to the individual guest physical
1086  *   memory ranges, the underlying amap of the single mmap'd range would have
1087  *   to allocate per-page reference counters. The reason is that the
1088  *   individual guest physical ranges would reference the single mmap'd region
1089  *   only partially. However, if every guest physical range has its own
1090  *   corresponding mmap'd userspace allocation, there are no partial
1091  *   references: every guest physical range fully references an mmap'd
1092  *   range => no per-page reference counters have to be allocated.
1093  *
1094  * Return values:
1095  *  0: success
1096  *  !0: failure - errno indicating the source of the failure
1097  */
1098 int
1099 alloc_guest_mem(struct vmd_vm *vm)
1100 {
1101 	void *p;
1102 	char *tmp;
1103 	int fd, ret = 0;
1104 	size_t i, j;
1105 	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
1106 	struct vm_mem_range *vmr;
1107 
1108 	tmp = calloc(32, sizeof(char));
1109 	if (tmp == NULL) {
1110 		ret = errno;
1111 		log_warn("%s: calloc", __func__);
1112 		return (ret);
1113 	}
1114 	strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32);
1115 
1116 	vm->vm_nmemfds = vcp->vcp_nmemranges;
1117 
1118 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1119 		vmr = &vcp->vcp_memranges[i];
1120 
1121 		fd = shm_mkstemp(tmp);
1122 		if (fd < 0) {
1123 			ret = errno;
1124 			log_warn("%s: shm_mkstemp", __func__);
1125 			return (ret);
1126 		}
1127 		if (ftruncate(fd, vmr->vmr_size) == -1) {
1128 			ret = errno;
1129 			log_warn("%s: ftruncate", __func__);
1130 			goto out;
1131 		}
1132 		if (fcntl(fd, F_SETFD, 0) == -1) {
1133 			ret = errno;
1134 			log_warn("%s: fcntl", __func__);
1135 			goto out;
1136 		}
1137 		if (shm_unlink(tmp) == -1) {
1138 			ret = errno;
1139 			log_warn("%s: shm_unlink", __func__);
1140 			goto out;
1141 		}
1142 		strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32);
1143 
1144 		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
1145 		    MAP_SHARED | MAP_CONCEAL, fd, 0);
1146 		if (p == MAP_FAILED) {
1147 			ret = errno;
1148 			for (j = 0; j < i; j++) {
1149 				vmr = &vcp->vcp_memranges[j];
1150 				munmap((void *)vmr->vmr_va, vmr->vmr_size);
1151 			}
1152 			goto out;
1153 		}
1154 		vm->vm_memfds[i] = fd;
1155 		vmr->vmr_va = (vaddr_t)p;
1156 	}
1157 out:
1158 	free(tmp);
1159 	return (ret);
1160 }
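
/*
 * The anonymous shared-memory idiom used above, in isolation (sketch,
 * not compiled): create a named shm object, size it, unlink the name and
 * keep only the descriptor, which can later be handed to device
 * processes.  "len" is a hypothetical size.
 */
#if 0
char name[] = "/tmp/vmd.XXXXXXXXXX";
size_t len = MB(1);
int memfd;
void *p;

if ((memfd = shm_mkstemp(name)) == -1 || ftruncate(memfd, len) == -1 ||
    shm_unlink(name) == -1)
	fatal("anonymous shm setup");
if ((p = mmap(NULL, len, PROT_READ | PROT_WRITE,
    MAP_SHARED | MAP_CONCEAL, memfd, 0)) == MAP_FAILED)
	fatal("mmap");
#endif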
1161 
1162 /*
1163  * vmm_create_vm
1164  *
1165  * Requests vmm(4) to create a new VM using the supplied creation
1166  * parameters. This operation results in the creation of the in-kernel
1167  * structures for the VM, but does not start the VM's vcpu(s).
1168  *
1169  * Parameters:
1170  *  vm: pointer to the vm object
1171  *
1172  * Return values:
1173  *  0: success
1174  *  !0 : ioctl to vmm(4) failed
1175  */
1176 static int
1177 vmm_create_vm(struct vmd_vm *vm)
1178 {
1179 	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
1180 
1181 	/* Sanity check arguments */
1182 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
1183 		return (EINVAL);
1184 
1185 	if (vcp->vcp_nmemranges == 0 ||
1186 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1187 		return (EINVAL);
1188 
1189 	if (vm->vm_params.vmc_ndisks > VM_MAX_DISKS_PER_VM)
1190 		return (EINVAL);
1191 
1192 	if (vm->vm_params.vmc_nnics > VM_MAX_NICS_PER_VM)
1193 		return (EINVAL);
1194 
1195 	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
1196 		return (errno);
1197 
1198 	return (0);
1199 }
1200 
1201 /*
1202  * init_emulated_hw
1203  *
1204  * Initializes the userspace hardware emulation
1205  */
1206 void
1207 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
1208     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1209 {
1210 	struct vm_create_params *vcp = &vmc->vmc_params;
1211 	size_t i;
1212 	uint64_t memlo, memhi;
1213 
1214 	/* Calculate memory size for NVRAM registers */
1215 	memlo = memhi = 0;
1216 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1217 		if (vcp->vcp_memranges[i].vmr_gpa == MB(1) &&
1218 		    vcp->vcp_memranges[i].vmr_size > (15 * MB(1)))
1219 			memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1));
1220 		else if (vcp->vcp_memranges[i].vmr_gpa == GB(4))
1221 			memhi = vcp->vcp_memranges[i].vmr_size;
1222 	}
1223 
1224 	/* Reset the IO port map */
1225 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1226 
1227 	/* Init i8253 PIT */
1228 	i8253_init(vcp->vcp_id);
1229 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1230 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1231 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1232 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1233 	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
1234 
1235 	/* Init mc146818 RTC */
1236 	mc146818_init(vcp->vcp_id, memlo, memhi);
1237 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1238 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1239 
1240 	/* Init master and slave PICs */
1241 	i8259_init();
1242 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1243 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1244 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1245 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1246 	ioports_map[ELCR0] = vcpu_exit_elcr;
1247 	ioports_map[ELCR1] = vcpu_exit_elcr;
1248 
1249 	/* Init ns8250 UART */
1250 	ns8250_init(con_fd, vcp->vcp_id);
1251 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1252 		ioports_map[i] = vcpu_exit_com;
1253 
1254 	/* Initialize PCI */
1255 	for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
1256 		ioports_map[i] = vcpu_exit_pci;
1257 
1258 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1259 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1260 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1261 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1262 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1263 	pci_init();
1264 
1265 	/* Initialize virtio devices */
1266 	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
1267 
1268 	/*
1269 	 * Init QEMU fw_cfg interface. Must be done last for pci hardware
1270 	 * detection.
1271 	 */
1272 	fw_cfg_init(vmc);
1273 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1274 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1275 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1276 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1277 }
1278 
1279 /*
1280  * restore_emulated_hw
1281  *
1282  * Restores the userspace hardware emulation from fd
1283  */
1284 void
1285 restore_emulated_hw(struct vm_create_params *vcp, int fd,
1286     int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
1287 {
1288 	/* struct vm_create_params *vcp = &vmc->vmc_params; */
1289 	int i;
1290 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1291 
1292 	/* Init i8253 PIT */
1293 	i8253_restore(fd, vcp->vcp_id);
1294 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1295 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1296 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1297 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1298 
1299 	/* Init master and slave PICs */
1300 	i8259_restore(fd);
1301 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1302 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1303 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1304 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1305 
1306 	/* Init ns8250 UART */
1307 	ns8250_restore(fd, con_fd, vcp->vcp_id);
1308 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1309 		ioports_map[i] = vcpu_exit_com;
1310 
1311 	/* Init mc146818 RTC */
1312 	mc146818_restore(fd, vcp->vcp_id);
1313 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1314 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1315 
1316 	/* Init QEMU fw_cfg interface */
1317 	fw_cfg_restore(fd);
1318 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1319 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1320 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1321 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1322 
1323 	/* Initialize PCI */
1324 	for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
1325 		ioports_map[i] = vcpu_exit_pci;
1326 
1327 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1328 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1329 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1330 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1331 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1332 	pci_restore(fd);
1333 	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
1334 }
1335 
1336 /*
1337  * run_vm
1338  *
1339  * Runs the VM whose creation parameters are specified in vmc
1340  *
1341  * Parameters:
1342  *  child_cdrom: previously-opened child ISO disk file descriptor
1343  *  child_disks: previously-opened child VM disk file descriptors
1344  *  child_taps: previously-opened child tap file descriptors
1345  *  vmc: vmop_create_params struct containing the VM's desired creation
1346  *      configuration
1347  *  vrs: VCPU register state to initialize
1348  *
1349  * Return values:
1350  *  0: the VM exited normally
1351  *  !0 : the VM exited abnormally or failed to start
1352  */
1353 static int
1354 run_vm(struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
1355 {
1356 	struct vm_create_params *vcp = &vmc->vmc_params;
1357 	struct vm_rwregs_params vregsp;
1358 	uint8_t evdone = 0;
1359 	size_t i;
1360 	int ret;
1361 	pthread_t *tid, evtid;
1362 	char tname[MAXCOMLEN + 1];
1363 	struct vm_run_params **vrp;
1364 	void *exit_status;
1365 
1366 	if (vcp == NULL)
1367 		return (EINVAL);
1368 
1369 	if (vcp->vcp_nmemranges == 0 ||
1370 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1371 		return (EINVAL);
1372 
1373 	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
1374 	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
1375 	if (tid == NULL || vrp == NULL) {
1376 		log_warn("%s: memory allocation error - exiting.",
1377 		    __progname);
1378 		return (ENOMEM);
1379 	}
1380 
1381 	log_debug("%s: starting %zu vcpu thread(s) for vm %s", __func__,
1382 	    vcp->vcp_ncpus, vcp->vcp_name);
1383 
1384 	/*
1385 	 * Create and launch one thread for each VCPU. These threads may
1386 	 * migrate between PCPUs over time; when such a migration happens,
1387 	 * vmm(4) detects it and reloads the necessary CPU state in the
1388 	 * kernel.
1389 	 */
1390 	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
1391 		vrp[i] = malloc(sizeof(struct vm_run_params));
1392 		if (vrp[i] == NULL) {
1393 			log_warn("%s: memory allocation error - "
1394 			    "exiting.", __progname);
1395 			/* caller will exit, so skip freeing */
1396 			return (ENOMEM);
1397 		}
1398 		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
1399 		if (vrp[i]->vrp_exit == NULL) {
1400 			log_warn("%s: memory allocation error - "
1401 			    "exiting.", __progname);
1402 			/* caller will exit, so skip freeing */
1403 			return (ENOMEM);
1404 		}
1405 		vrp[i]->vrp_vm_id = vcp->vcp_id;
1406 		vrp[i]->vrp_vcpu_id = i;
1407 
1408 		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
1409 			log_warnx("%s: cannot reset VCPU %zu - exiting.",
1410 			    __progname, i);
1411 			return (EIO);
1412 		}
1413 
1414 		/* Write the regs again; VMM_IOC_RESETCPU may adjust them. */
1415 		if (current_vm->vm_state & VM_STATE_RECEIVED) {
1416 			vregsp.vrwp_vm_id = vcp->vcp_id;
1417 			vregsp.vrwp_vcpu_id = i;
1418 			vregsp.vrwp_regs = *vrs;
1419 			vregsp.vrwp_mask = VM_RWREGS_ALL;
1420 			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
1421 			    &vregsp)) == -1) {
1422 				log_warn("%s: writeregs failed", __func__);
1423 				return (ret);
1424 			}
1425 		}
1426 
1427 		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
1428 		if (ret) {
1429 			log_warnx("%s: cannot initialize cond var (%d)",
1430 			    __progname, ret);
1431 			return (ret);
1432 		}
1433 
1434 		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
1435 		if (ret) {
1436 			log_warnx("%s: cannot initialize mtx (%d)",
1437 			    __progname, ret);
1438 			return (ret);
1439 		}
1440 
1441 		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
1442 		if (ret) {
1443 			log_warnx("%s: cannot initialize unpause var (%d)",
1444 			    __progname, ret);
1445 			return (ret);
1446 		}
1447 
1448 		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
1449 		if (ret) {
1450 			log_warnx("%s: cannot initialize unpause mtx (%d)",
1451 			    __progname, ret);
1452 			return (ret);
1453 		}
1454 
1455 		vcpu_hlt[i] = 0;
1456 
1457 		/* Start each VCPU run thread at vcpu_run_loop */
1458 		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
1459 		if (ret) {
1460 			/* caller will _exit after this return */
1461 			ret = errno;
1462 			log_warn("%s: could not create vcpu thread %zu",
1463 			    __func__, i);
1464 			return (ret);
1465 		}
1466 
1467 		snprintf(tname, sizeof(tname), "vcpu-%zu", i);
1468 		pthread_set_name_np(tid[i], tname);
1469 	}
1470 
1471 	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
1472 	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
1473 	if (ret) {
1474 		errno = ret;
1475 		log_warn("%s: could not create event thread", __func__);
1476 		return (ret);
1477 	}
1478 	pthread_set_name_np(evtid, "event");
1479 
1480 	for (;;) {
1481 		ret = pthread_cond_wait(&threadcond, &threadmutex);
1482 		if (ret) {
1483 			log_warn("%s: waiting on thread state condition "
1484 			    "variable failed", __func__);
1485 			return (ret);
1486 		}
1487 
1488 		/*
1489 		 * Did a VCPU thread exit with an error? => return the first one
1490 		 */
1491 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1492 			if (vcpu_done[i] == 0)
1493 				continue;
1494 
1495 			if (pthread_join(tid[i], &exit_status)) {
1496 				log_warn("%s: failed to join thread %zd - "
1497 				    "exiting", __progname, i);
1498 				return (EIO);
1499 			}
1500 
1501 			ret = (intptr_t)exit_status;
1502 		}
1503 
1504 		/* Did the event thread exit? => return with an error */
1505 		if (evdone) {
1506 			if (pthread_join(evtid, &exit_status)) {
1507 				log_warn("%s: failed to join event thread - "
1508 				    "exiting", __progname);
1509 				return (EIO);
1510 			}
1511 
1512 			log_warnx("%s: vm %d event thread exited "
1513 			    "unexpectedly", __progname, vcp->vcp_id);
1514 			return (EIO);
1515 		}
1516 
1517 		/* Did all VCPU threads exit successfully? => return */
1518 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1519 			if (vcpu_done[i] == 0)
1520 				break;
1521 		}
1522 		if (i == vcp->vcp_ncpus)
1523 			return (ret);
1524 
1525 		/* Some more threads to wait for, start over */
1526 	}
1527 
1528 	return (ret);
1529 }
1530 
1531 void *
1532 event_thread(void *arg)
1533 {
1534 	uint8_t *donep = arg;
1535 	intptr_t ret;
1536 
1537 	ret = event_dispatch();
1538 
1539 	mutex_lock(&threadmutex);
1540 	*donep = 1;
1541 	pthread_cond_signal(&threadcond);
1542 	mutex_unlock(&threadmutex);
1543 
1544 	return (void *)ret;
1545 }
1546 
1547 /*
1548  * vcpu_run_loop
1549  *
1550  * Runs a single VCPU until vmm(4) requires help handling an exit,
1551  * or the VM terminates.
1552  *
1553  * Parameters:
1554  *  arg: vcpu_run_params for the VCPU being run by this thread
1555  *
1556  * Return values:
1557  *  NULL: the VCPU shutdown properly
1558  *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
1559  */
1560 void *
1561 vcpu_run_loop(void *arg)
1562 {
1563 	struct vm_run_params *vrp = (struct vm_run_params *)arg;
1564 	intptr_t ret = 0;
1565 	int irq;
1566 	uint32_t n;
1567 
1568 	vrp->vrp_continue = 0;
1569 	n = vrp->vrp_vcpu_id;
1570 
1571 	for (;;) {
1572 		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
1573 
1574 		if (ret) {
1575 			log_warnx("%s: can't lock vcpu run mtx (%d)",
1576 			    __func__, (int)ret);
1577 			return ((void *)ret);
1578 		}
1579 
1580 		/* If we are halted and need to pause, pause */
1581 		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
1582 			ret = pthread_barrier_wait(&vm_pause_barrier);
1583 			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
1584 				log_warnx("%s: could not wait on pause barrier (%d)",
1585 				    __func__, (int)ret);
1586 				return ((void *)ret);
1587 			}
1588 
1589 			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
1590 			if (ret) {
1591 				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
1592 				    __func__, (int)ret);
1593 				return ((void *)ret);
1594 			}
1595 
1596 			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
1597 			    &vcpu_unpause_mtx[n]);
1598 			if (ret) {
1599 				log_warnx(
1600 				    "%s: can't wait on unpause cond (%d)",
1601 				    __func__, (int)ret);
1602 				break;
1603 			}
1604 			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
1605 			if (ret) {
1606 				log_warnx("%s: can't unlock unpause mtx (%d)",
1607 				    __func__, (int)ret);
1608 				break;
1609 			}
1610 		}
1611 
1612 		/* If we are halted and not paused, wait */
1613 		if (vcpu_hlt[n]) {
1614 			ret = pthread_cond_wait(&vcpu_run_cond[n],
1615 			    &vcpu_run_mtx[n]);
1616 
1617 			if (ret) {
1618 				log_warnx(
1619 				    "%s: can't wait on cond (%d)",
1620 				    __func__, (int)ret);
1621 				(void)pthread_mutex_unlock(
1622 				    &vcpu_run_mtx[n]);
1623 				break;
1624 			}
1625 		}
1626 
1627 		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
1628 
1629 		if (ret) {
1630 			log_warnx("%s: can't unlock mutex on cond (%d)",
1631 			    __func__, (int)ret);
1632 			break;
1633 		}
1634 
1635 		if (vrp->vrp_irqready && i8259_is_pending()) {
1636 			irq = i8259_ack();
1637 			vrp->vrp_irq = irq;
1638 		} else
1639 			vrp->vrp_irq = 0xFFFF;
1640 
1641 		/* Still more pending? */
1642 		if (i8259_is_pending()) {
1643 			/*
1644 			 * XXX can probably avoid ioctls here by providing intr
1645 			 * in vrp
1646 			 */
1647 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1648 			    vrp->vrp_vcpu_id, 1)) {
1649 				fatal("can't set INTR");
1650 			}
1651 		} else {
1652 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1653 			    vrp->vrp_vcpu_id, 0)) {
1654 				fatal("can't clear INTR");
1655 			}
1656 		}
1657 
1658 		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
1659 			/* If run ioctl failed, exit */
1660 			ret = errno;
1661 			log_warn("%s: vm %d / vcpu %d run ioctl failed",
1662 			    __func__, vrp->vrp_vm_id, n);
1663 			break;
1664 		}
1665 
1666 		/* If the VM is terminating, exit normally */
1667 		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
1668 			ret = (intptr_t)NULL;
1669 			break;
1670 		}
1671 
1672 		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
1673 			/*
1674 			 * vmm(4) needs help handling an exit, handle in
1675 			 * vcpu_exit.
1676 			 */
1677 			ret = vcpu_exit(vrp);
1678 			if (ret)
1679 				break;
1680 		}
1681 	}
1682 
1683 	mutex_lock(&threadmutex);
1684 	vcpu_done[n] = 1;
1685 	pthread_cond_signal(&threadcond);
1686 	mutex_unlock(&threadmutex);
1687 
1688 	return ((void *)ret);
1689 }
1690 
1691 int
1692 vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
1693 {
1694 	struct vm_intr_params vip;
1695 
1696 	memset(&vip, 0, sizeof(vip));
1697 
1698 	vip.vip_vm_id = vm_id;
1699 	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
1700 	vip.vip_intr = intr;
1701 
1702 	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
1703 		return (errno);
1704 
1705 	return (0);
1706 }
1707 
1708 /*
1709  * vcpu_exit_pci
1710  *
1711  * Handle all I/O to the emulated PCI subsystem.
1712  *
1713  * Parameters:
1714  *  vrp: vcpu run parameters containing guest state for this exit
1715  *
1716  * Return value:
1717  *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
1718  *      be injected.
1719  */
1720 uint8_t
1721 vcpu_exit_pci(struct vm_run_params *vrp)
1722 {
1723 	struct vm_exit *vei = vrp->vrp_exit;
1724 	uint8_t intr;
1725 
1726 	intr = 0xFF;
1727 
1728 	switch (vei->vei.vei_port) {
1729 	case PCI_MODE1_ADDRESS_REG:
1730 		pci_handle_address_reg(vrp);
1731 		break;
1732 	case PCI_MODE1_DATA_REG:
1733 	case PCI_MODE1_DATA_REG + 1:
1734 	case PCI_MODE1_DATA_REG + 2:
1735 	case PCI_MODE1_DATA_REG + 3:
1736 		pci_handle_data_reg(vrp);
1737 		break;
1738 	case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END:
1739 		intr = pci_handle_io(vrp);
1740 		break;
1741 	default:
1742 		log_warnx("%s: unknown PCI register 0x%llx",
1743 		    __progname, (uint64_t)vei->vei.vei_port);
1744 		break;
1745 	}
1746 
1747 	return (intr);
1748 }
1749 
1750 /*
1751  * vcpu_exit_inout
1752  *
1753  * Handle all I/O exits that need to be emulated in vmd. This includes the
1754  * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
1755  *
1756  * Parameters:
1757  *  vrp: vcpu run parameters containing guest state for this exit
1758  */
1759 void
1760 vcpu_exit_inout(struct vm_run_params *vrp)
1761 {
1762 	struct vm_exit *vei = vrp->vrp_exit;
1763 	uint8_t intr = 0xFF;
1764 
1765 	if (vei->vei.vei_rep || vei->vei.vei_string) {
1766 #ifdef MMIO_DEBUG
1767 		log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x",
1768 		    __func__,
1769 		    vei->vei.vei_rep == 0 ? "" : "REP ",
1770 		    vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT",
1771 		    vei->vei.vei_string == 0 ? "" : "S",
1772 		    vei->vei.vei_size, vei->vei.vei_encoding,
1773 		    vei->vei.vei_data, vei->vei.vei_port);
1774 		log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx",
1775 		    __func__,
1776 		    vei->vrs.vrs_gprs[VCPU_REGS_RCX],
1777 		    vei->vrs.vrs_gprs[VCPU_REGS_RDX],
1778 		    vei->vrs.vrs_gprs[VCPU_REGS_RSI]);
1779 #endif /* MMIO_DEBUG */
1780 		fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)",
1781 		    __func__);
1782 	}
1783 
1784 	if (ioports_map[vei->vei.vei_port] != NULL)
1785 		intr = ioports_map[vei->vei.vei_port](vrp);
1786 	else if (vei->vei.vei_dir == VEI_DIR_IN)
1787 		set_return_data(vei, 0xFFFFFFFF);
1788 
1789 	if (intr != 0xFF)
1790 		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
1791 }
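
/*
 * Illustrative sketch of a minimal port handler matching the io_fn_t
 * dispatch used above (hypothetical device, not part of vmd): an IN
 * returns a value to the guest via set_return_data(), and the return
 * value names the IRQ to assert, or 0xFF for none.  It would be wired up
 * with "ioports_map[SOME_PORT] = example_io_handler;".
 */
#if 0
static uint8_t
example_io_handler(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;

	if (vei->vei.vei_dir == VEI_DIR_IN)
		set_return_data(vei, 0x0);	/* value the guest reads */

	return (0xFF);				/* no interrupt to assert */
}
#endif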
1792 
1793 /*
1794  * vcpu_exit_eptviolation
1795  *
1796  * Handle an EPT violation.
1797  *
1798  * Parameters:
1799  *  vrp: vcpu run parameters containing guest state for this exit
1800  *
1801  * Return values:
1802  *  0: no action required
1803  *  EFAULT: a protection fault occurred, kill the vm.
1804  */
1805 int
1806 vcpu_exit_eptviolation(struct vm_run_params *vrp)
1807 {
1808 	struct vm_exit *ve = vrp->vrp_exit;
1809 	int ret = 0;
1810 #if MMIO_NOTYET
1811 	struct x86_insn insn;
1812 	uint64_t va, pa;
1813 	size_t len = 15;		/* Max instruction length in x86. */
1814 #endif /* MMIO_NOTYET */
1815 	switch (ve->vee.vee_fault_type) {
1816 	case VEE_FAULT_HANDLED:
1817 		log_debug("%s: fault already handled", __func__);
1818 		break;
1819 
1820 #if MMIO_NOTYET
1821 	case VEE_FAULT_MMIO_ASSIST:
1822 		/* Intel VMX might give us the length of the instruction. */
1823 		if (ve->vee.vee_insn_info & VEE_LEN_VALID)
1824 			len = ve->vee.vee_insn_len;
1825 
1826 		if (len > 15)
1827 			fatalx("%s: invalid instruction length %lu", __func__,
1828 			    len);
1829 
1830 		/* If we weren't given instruction bytes, we need to fetch. */
1831 		if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) {
1832 			memset(ve->vee.vee_insn_bytes, 0,
1833 			    sizeof(ve->vee.vee_insn_bytes));
1834 			va = ve->vrs.vrs_gprs[VCPU_REGS_RIP];
1835 
1836 			/* XXX Only support instructions that fit on 1 page. */
1837 			if ((va & PAGE_MASK) + len > PAGE_SIZE) {
1838 				log_warnx("%s: instruction might cross page "
1839 				    "boundary", __func__);
1840 				ret = EINVAL;
1841 				break;
1842 			}
1843 
1844 			ret = translate_gva(ve, va, &pa, PROT_EXEC);
1845 			if (ret != 0) {
1846 				log_warnx("%s: failed gva translation",
1847 				    __func__);
1848 				break;
1849 			}
1850 
1851 			ret = read_mem(pa, ve->vee.vee_insn_bytes, len);
1852 			if (ret != 0) {
1853 				log_warnx("%s: failed to fetch instruction "
1854 				    "bytes from 0x%llx", __func__, pa);
1855 				break;
1856 			}
1857 		}
1858 
1859 		ret = insn_decode(ve, &insn);
1860 		if (ret == 0)
1861 			ret = insn_emulate(ve, &insn);
1862 		break;
1863 #endif /* MMIO_NOTYET */
1864 
1865 	case VEE_FAULT_PROTECT:
1866 		log_debug("%s: EPT Violation: rip=0x%llx", __progname,
1867 		    ve->vrs.vrs_gprs[VCPU_REGS_RIP]);
1868 		ret = EFAULT;
1869 		break;
1870 
1871 	default:
1872 		fatalx("%s: invalid fault_type %d", __progname,
1873 		    ve->vee.vee_fault_type);
1874 		/* UNREACHED */
1875 	}
1876 
1877 	return (ret);
1878 }
1879 
1880 /*
1881  * vcpu_exit
1882  *
1883  * Handle a vcpu exit. This function is called when it is determined that
1884  * vmm(4) requires the assistance of vmd to support a particular guest
1885  * exit type (eg, accessing an I/O port or device). Guest state is contained
1886  * in 'vrp', and will be resent to vmm(4) on exit completion.
1887  *
1888  * Upon conclusion of handling the exit, the function determines if any
1889  * interrupts should be injected into the guest, and asserts the proper
1890  * IRQ line whose interrupt should be vectored.
1891  *
1892  * Parameters:
1893  *  vrp: vcpu run parameters containing guest state for this exit
1894  *
1895  * Return values:
1896  *  0: the exit was handled successfully
1897  *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
1898  */
1899 int
1900 vcpu_exit(struct vm_run_params *vrp)
1901 {
1902 	int ret;
1903 
1904 	switch (vrp->vrp_exit_reason) {
1905 	case VMX_EXIT_INT_WINDOW:
1906 	case SVM_VMEXIT_VINTR:
1907 	case VMX_EXIT_CPUID:
1908 	case VMX_EXIT_EXTINT:
1909 	case SVM_VMEXIT_INTR:
1910 	case SVM_VMEXIT_MSR:
1911 	case SVM_VMEXIT_CPUID:
1912 		/*
1913 		 * We may be exiting to vmd to handle a pending interrupt but
1914 		 * at the same time the last exit type may have been one of
1915 		 * these. In this case, there's nothing extra to be done
1916 		 * here (and falling through to the default case below results
1917 		 * in more vmd log spam).
1918 		 */
1919 		break;
1920 	case SVM_VMEXIT_NPF:
1921 	case VMX_EXIT_EPT_VIOLATION:
1922 		ret = vcpu_exit_eptviolation(vrp);
1923 		if (ret)
1924 			return (ret);
1925 		break;
1926 	case VMX_EXIT_IO:
1927 	case SVM_VMEXIT_IOIO:
1928 		vcpu_exit_inout(vrp);
1929 		break;
1930 	case VMX_EXIT_HLT:
1931 	case SVM_VMEXIT_HLT:
1932 		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1933 		if (ret) {
1934 			log_warnx("%s: can't lock vcpu mutex (%d)",
1935 			    __func__, ret);
1936 			return (ret);
1937 		}
1938 		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
1939 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1940 		if (ret) {
1941 			log_warnx("%s: can't unlock vcpu mutex (%d)",
1942 			    __func__, ret);
1943 			return (ret);
1944 		}
1945 		break;
1946 	case VMX_EXIT_TRIPLE_FAULT:
1947 	case SVM_VMEXIT_SHUTDOWN:
1948 		/* reset VM */
1949 		return (EAGAIN);
1950 	default:
1951 		log_debug("%s: unknown exit reason 0x%x",
1952 		    __progname, vrp->vrp_exit_reason);
1953 	}
1954 
1955 	vrp->vrp_continue = 1;
1956 
1957 	return (0);
1958 }
1959 
1960 /*
1961  * find_gpa_range
1962  *
1963  * Search for a contiguous guest physical mem range.
1964  *
1965  * Parameters:
1966  *  vcp: VM create parameters that contain the memory map to search in
1967  *  gpa: the starting guest physical address
1968  *  len: the length of the memory range
1969  *
1970  * Return values:
1971  *  NULL: on failure if there is no memory range as described by the parameters
1972  *  Pointer to vm_mem_range that contains the start of the range otherwise.
1973  */
1974 static struct vm_mem_range *
1975 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
1976 {
1977 	size_t i, n;
1978 	struct vm_mem_range *vmr;
1979 
1980 	/* Find the first vm_mem_range that contains gpa */
1981 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1982 		vmr = &vcp->vcp_memranges[i];
1983 		if (gpa < vmr->vmr_gpa + vmr->vmr_size)
1984 			break;
1985 	}
1986 
1987 	/* No range found. */
1988 	if (i == vcp->vcp_nmemranges)
1989 		return (NULL);
1990 
1991 	/*
1992 	 * vmr may cover the range [gpa, gpa + len) only partly. Make
1993 	 * sure that the following vm_mem_ranges are contiguous and
1994 	 * cover the rest.
1995 	 */
1996 	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
1997 	if (len < n)
1998 		len = 0;
1999 	else
2000 		len -= n;
2001 	gpa = vmr->vmr_gpa + vmr->vmr_size;
2002 	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
2003 		vmr = &vcp->vcp_memranges[i];
2004 		if (gpa != vmr->vmr_gpa)
2005 			return (NULL);
2006 		if (len <= vmr->vmr_size)
2007 			len = 0;
2008 		else
2009 			len -= vmr->vmr_size;
2010 
2011 		gpa = vmr->vmr_gpa + vmr->vmr_size;
2012 	}
2013 
2014 	if (len != 0)
2015 		return (NULL);
2016 
2017 	return (vmr);
2018 }
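
/*
 * Worked example (illustrative): given two adjacent ranges
 * { vmr_gpa 0x0, vmr_size 0x10000 } and { vmr_gpa 0x10000, vmr_size 0x10000 },
 * a lookup of gpa = 0xff00, len = 0x200 starts in the first range, which
 * covers n = 0x10000 - 0xff00 = 0x100 bytes of the request; the remaining
 * 0x100 bytes are satisfied by the second range because it begins exactly
 * at 0x10000.  Any gap between the ranges would make the lookup fail and
 * return NULL.
 */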
2019 
2020 /*
2021  * write_mem
2022  *
2023  * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
2024  *
2025  * Parameters:
2026  *  dst: the destination paddr_t in the guest VM
2027  *  buf: data to copy (or NULL to zero the data)
2028  *  len: number of bytes to copy
2029  *
2030  * Return values:
2031  *  0: success
2032  *  EINVAL: if the guest physical memory range [dst, dst + len) does not
2033  *      exist in the guest.
2034  */
2035 int
2036 write_mem(paddr_t dst, const void *buf, size_t len)
2037 {
2038 	const char *from = buf;
2039 	char *to;
2040 	size_t n, off;
2041 	struct vm_mem_range *vmr;
2042 
2043 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
2044 	if (vmr == NULL) {
2045 		errno = EINVAL;
2046 		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
2047 		    "len = 0x%zx", __func__, dst, len);
2048 		return (EINVAL);
2049 	}
2050 
2051 	off = dst - vmr->vmr_gpa;
2052 	while (len != 0) {
2053 		n = vmr->vmr_size - off;
2054 		if (len < n)
2055 			n = len;
2056 
2057 		to = (char *)vmr->vmr_va + off;
2058 		if (buf == NULL)
2059 			memset(to, 0, n);
2060 		else {
2061 			memcpy(to, from, n);
2062 			from += n;
2063 		}
2064 		len -= n;
2065 		off = 0;
2066 		vmr++;
2067 	}
2068 
2069 	return (0);
2070 }
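
/*
 * Usage sketch (illustrative): zero-filling guest memory versus copying a
 * local buffer into it ('bootargs' is a hypothetical buffer):
 *
 *	write_mem(0x1000, NULL, PAGE_SIZE);		(zero a page)
 *	write_mem(0x2000, bootargs, sizeof(bootargs));	(copy a buffer)
 *
 * Either call returns EINVAL if any part of the destination span falls
 * outside the guest's memory ranges.
 */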
2071 
2072 /*
2073  * read_mem
2074  *
2075  * Reads memory at guest paddr 'src' into 'buf'.
2076  *
2077  * Parameters:
2078  *  src: the source paddr_t in the guest VM to read from.
2079  *  buf: destination (local) buffer
2080  *  len: number of bytes to read
2081  *
2082  * Return values:
2083  *  0: success
2084  *  EINVAL: if the guest physical memory range [src, src + len) does not
2085  *      exist in the guest.
2086  */
2087 int
2088 read_mem(paddr_t src, void *buf, size_t len)
2089 {
2090 	char *from, *to = buf;
2091 	size_t n, off;
2092 	struct vm_mem_range *vmr;
2093 
2094 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
2095 	if (vmr == NULL) {
2096 		errno = EINVAL;
2097 		log_warn("%s: failed - invalid memory range src = 0x%lx, "
2098 		    "len = 0x%zx", __func__, src, len);
2099 		return (EINVAL);
2100 	}
2101 
2102 	off = src - vmr->vmr_gpa;
2103 	while (len != 0) {
2104 		n = vmr->vmr_size - off;
2105 		if (len < n)
2106 			n = len;
2107 
2108 		from = (char *)vmr->vmr_va + off;
2109 		memcpy(to, from, n);
2110 
2111 		to += n;
2112 		len -= n;
2113 		off = 0;
2114 		vmr++;
2115 	}
2116 
2117 	return (0);
2118 }
2119 
2120 /*
2121  * hvaddr_mem
2122  *
2123  * Translate a guest physical address to a host virtual address, checking the
2124  * provided memory range length to confirm it's contiguous within the same
2125  * guest memory range (vm_mem_range).
2126  *
2127  * Parameters:
2128  *  gpa: guest physical address to translate
2129  *  len: number of bytes in the intended range
2130  *
2131  * Return values:
2132  *  void* to host virtual memory on success
2133  *  NULL on error, setting errno to:
2134  *    EFAULT: gpa falls outside guest memory ranges
2135  *    EINVAL: requested len extends beyond memory range
2136  */
2137 void *
2138 hvaddr_mem(paddr_t gpa, size_t len)
2139 {
2140 	struct vm_mem_range *vmr;
2141 	size_t off;
2142 
2143 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len);
2144 	if (vmr == NULL) {
2145 		log_warnx("%s: failed - invalid gpa: 0x%lx\n", __func__, gpa);
2146 		errno = EFAULT;
2147 		return (NULL);
2148 	}
2149 
2150 	off = gpa - vmr->vmr_gpa;
2151 	if (len > (vmr->vmr_size - off)) {
2152 		log_warnx("%s: failed - invalid memory range: gpa=0x%lx, "
2153 		    "len=%zu", __func__, gpa, len);
2154 		errno = EINVAL;
2155 		return (NULL);
2156 	}
2157 
2158 	return ((char *)vmr->vmr_va + off);
2159 }
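
/*
 * Usage sketch (illustrative): device code that wants a direct pointer
 * into guest memory, rather than a copy, can translate once and then work
 * on the host mapping ('ring_gpa' and 'struct some_ring' are placeholders):
 *
 *	struct some_ring *r = hvaddr_mem(ring_gpa, sizeof(*r));
 *	if (r == NULL)
 *		return (errno);		(EFAULT or EINVAL, as above)
 *
 * The length check guarantees the object does not straddle two
 * vm_mem_range entries, so 'r' is safe to dereference for sizeof(*r) bytes.
 */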
2160 
2161 /*
2162  * vcpu_assert_pic_irq
2163  *
2164  * Injects the specified IRQ on the supplied vcpu/vm
2165  *
2166  * Parameters:
2167  *  vm_id: VM ID to inject to
2168  *  vcpu_id: VCPU ID to inject to
2169  *  irq: IRQ to inject
2170  */
2171 void
2172 vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
2173 {
2174 	int ret;
2175 
2176 	i8259_assert_irq(irq);
2177 
2178 	if (i8259_is_pending()) {
2179 		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
2180 			fatalx("%s: can't assert INTR", __func__);
2181 
2182 		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
2183 		if (ret)
2184 			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);
2185 
2186 		vcpu_hlt[vcpu_id] = 0;
2187 		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
2188 		if (ret)
2189 			fatalx("%s: can't signal (%d)", __func__, ret);
2190 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
2191 		if (ret)
2192 			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
2193 	}
2194 }
2195 
2196 /*
2197  * vcpu_deassert_pic_irq
2198  *
2199  * Clears the specified IRQ on the supplied vcpu/vm
2200  *
2201  * Parameters:
2202  *  vm_id: VM ID to clear in
2203  *  vcpu_id: VCPU ID to clear in
2204  *  irq: IRQ to clear
2205  */
2206 void
2207 vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
2208 {
2209 	i8259_deassert_irq(irq);
2210 
2211 	if (!i8259_is_pending()) {
2212 		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
2213 			fatalx("%s: can't deassert INTR for vm_id %d, "
2214 			    "vcpu_id %d", __func__, vm_id, vcpu_id);
2215 	}
2216 }
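
/*
 * Illustrative note: device emulation typically raises an interrupt and
 * clears it once the guest has serviced the source (names hypothetical):
 *
 *	vcpu_assert_pic_irq(vm_id, 0, dev_irq);
 *	... guest reads the device's interrupt status register ...
 *	vcpu_deassert_pic_irq(vm_id, 0, dev_irq);
 *
 * Asserting an IRQ also wakes a halted vcpu: when the i8259 reports a
 * pending interrupt, vcpu_assert_pic_irq() clears vcpu_hlt and signals
 * vcpu_run_cond.
 */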
2217 
2218 /*
2219  * fd_hasdata
2220  *
2221  * Determines if data can be read from a file descriptor.
2222  *
2223  * Parameters:
2224  *  fd: the fd to check
2225  *
2226  * Return values:
2227  *  1 if data can be read from an fd, or 0 otherwise.
2228  */
2229 int
2230 fd_hasdata(int fd)
2231 {
2232 	struct pollfd pfd[1];
2233 	int nready, hasdata = 0;
2234 
2235 	pfd[0].fd = fd;
2236 	pfd[0].events = POLLIN;
2237 	nready = poll(pfd, 1, 0);
2238 	if (nready == -1)
2239 		log_warn("checking file descriptor for data failed");
2240 	else if (nready == 1 && pfd[0].revents & POLLIN)
2241 		hasdata = 1;
2242 	return (hasdata);
2243 }
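
/*
 * Usage sketch (illustrative): poll-before-read on a device backend fd
 * ('dev_fd' and 'buf' are hypothetical):
 *
 *	if (fd_hasdata(dev_fd))
 *		n = read(dev_fd, buf, sizeof(buf));
 *
 * The zero timeout means the check never blocks; a poll(2) error is only
 * logged and reported as "no data available".
 */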
2244 
2245 /*
2246  * mutex_lock
2247  *
2248  * Wrapper function for pthread_mutex_lock that does error checking and
2249  * exits on failure.
2250  */
2251 void
2252 mutex_lock(pthread_mutex_t *m)
2253 {
2254 	int ret;
2255 
2256 	ret = pthread_mutex_lock(m);
2257 	if (ret) {
2258 		errno = ret;
2259 		fatal("could not acquire mutex");
2260 	}
2261 }
2262 
2263 /*
2264  * mutex_unlock
2265  *
2266  * Wrapper function for pthread_mutex_unlock that does error checking and
2267  * exits on failure.
2268  */
2269 void
2270 mutex_unlock(pthread_mutex_t *m)
2271 {
2272 	int ret;
2273 
2274 	ret = pthread_mutex_unlock(m);
2275 	if (ret) {
2276 		errno = ret;
2277 		fatal("could not release mutex");
2278 	}
2279 }
2280 
2281 /*
2282  * set_return_data
2283  *
2284  * Utility function for manipulating register data in vm exit info structs. This
2285  * function ensures that the data is copied to the vei->vei.vei_data field with
2286  * the proper size for the operation being performed.
2287  *
2288  * Parameters:
2289  *  vei: exit information
2290  *  data: return data
2291  */
2292 void
2293 set_return_data(struct vm_exit *vei, uint32_t data)
2294 {
2295 	switch (vei->vei.vei_size) {
2296 	case 1:
2297 		vei->vei.vei_data &= ~0xFF;
2298 		vei->vei.vei_data |= (uint8_t)data;
2299 		break;
2300 	case 2:
2301 		vei->vei.vei_data &= ~0xFFFF;
2302 		vei->vei.vei_data |= (uint16_t)data;
2303 		break;
2304 	case 4:
2305 		vei->vei.vei_data = data;
2306 		break;
2307 	}
2308 }
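
/*
 * Worked example (illustrative): for a 1-byte IN with vei_data previously
 * 0x12345678, set_return_data(vei, 0xAB) leaves vei_data at 0x123456AB;
 * only the low byte is replaced, matching the operand size of the access.
 */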
2309 
2310 /*
2311  * get_input_data
2312  *
2313  * Utility function for manipulating register data in vm exit info
2314  * structs. This function ensures that the data is copied from the
2315  * vei->vei.vei_data field with the proper size for the operation being
2316  * performed.
2317  *
2318  * Parameters:
2319  *  vei: exit information
2320  *  data: location to store the result
2321  */
2322 void
2323 get_input_data(struct vm_exit *vei, uint32_t *data)
2324 {
2325 	switch (vei->vei.vei_size) {
2326 	case 1:
2327 		*data &= 0xFFFFFF00;
2328 		*data |= (uint8_t)vei->vei.vei_data;
2329 		break;
2330 	case 2:
2331 		*data &= 0xFFFF0000;
2332 		*data |= (uint16_t)vei->vei.vei_data;
2333 		break;
2334 	case 4:
2335 		*data = vei->vei.vei_data;
2336 		break;
2337 	default:
2338 		log_warnx("%s: invalid i/o size %d", __func__,
2339 		    vei->vei.vei_size);
2340 	}
2341 
2342 }
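
/*
 * Worked example (illustrative): for a 2-byte OUT with vei_data 0xBEEF and
 * a caller-provided *data of 0x11223344, get_input_data() yields
 * 0x1122BEEF; the upper half of the destination is preserved and only the
 * width actually written by the guest is taken from the exit.
 */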
2343 
2344 /*
2345  * translate_gva
2346  *
2347  * Translates a guest virtual address to a guest physical address by walking
2348  * the currently active page table (if needed).
2349  *
2350  * XXX ensure translate_gva updates the A bit in the PTE
2351  * XXX ensure translate_gva respects segment base and limits in i386 mode
2352  * XXX ensure translate_gva respects segment wraparound in i8086 mode
2353  * XXX ensure translate_gva updates the A bit in the segment selector
2354  * XXX ensure translate_gva respects CR4.LMSLE if available
2355  *
2356  * Parameters:
2357  *  exit: The VCPU this translation should be performed for (guest MMU settings
2358  *   are gathered from this VCPU)
2359  *  va: virtual address to translate
2360  *  pa: pointer to paddr_t variable that will receive the translated physical
2361  *   address. 'pa' is unchanged on error.
2362  *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
2363  *   the address should be translated
2364  *
2365  * Return values:
2366  *  0: the address was successfully translated - 'pa' contains the physical
2367  *     address currently mapped by 'va'.
2368  *  EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
2369  *     and %cr2 set in the vcpu structure.
2370  *  EINVAL: an error occurred reading paging table structures
2371  */
2372 int
2373 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
2374 {
2375 	int level, shift, pdidx;
2376 	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
2377 	uint64_t shift_width, pte_size;
2378 	struct vcpu_reg_state *vrs;
2379 
2380 	vrs = &exit->vrs;
2381 
2382 	if (!pa)
2383 		return (EINVAL);
2384 
2385 	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
2386 		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
2387 		*pa = va;
2388 		return (0);
2389 	}
2390 
2391 	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
2392 
2393 	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
2394 	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
2395 
2396 	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
2397 		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
2398 			pte_size = sizeof(uint64_t);
2399 			shift_width = 9;
2400 
2401 			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
2402 				/* 4 level paging */
2403 				level = 4;
2404 				mask = L4_MASK;
2405 				shift = L4_SHIFT;
2406 			} else {
2407 				/* 32 bit with PAE paging */
2408 				level = 3;
2409 				mask = L3_MASK;
2410 				shift = L3_SHIFT;
2411 			}
2412 		} else {
2413 			/* 32 bit paging */
2414 			level = 2;
2415 			shift_width = 10;
2416 			mask = 0xFFC00000;
2417 			shift = 22;
2418 			pte_size = sizeof(uint32_t);
2419 		}
2420 	} else
2421 		return (EINVAL);
2422 
2423 	/* XXX: Check for R bit in segment selector and set A bit */
2424 
2425 	for (;level > 0; level--) {
2426 		pdidx = (va & mask) >> shift;
2427 		pte_paddr = (pt_paddr) + (pdidx * pte_size);
2428 
2429 		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
2430 		    level, pte_paddr);
2431 		if (read_mem(pte_paddr, &pte, pte_size)) {
2432 			log_warn("%s: failed to read pte", __func__);
2433 			return (EFAULT);
2434 		}
2435 
2436 		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
2437 		    pte);
2438 
2439 		/* XXX: Set CR2  */
2440 		if (!(pte & PG_V))
2441 			return (EFAULT);
2442 
2443 		/* XXX: Check for SMAP */
2444 		if ((mode == PROT_WRITE) && !(pte & PG_RW))
2445 			return (EPERM);
2446 
2447 		if ((exit->cpl > 0) && !(pte & PG_u))
2448 			return (EPERM);
2449 
2450 		pte = pte | PG_U;
2451 		if (mode == PROT_WRITE)
2452 			pte = pte | PG_M;
2453 		if (write_mem(pte_paddr, &pte, pte_size)) {
2454 			log_warn("%s: failed to write back flags to pte",
2455 			    __func__);
2456 			return (EIO);
2457 		}
2458 
2459 		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
2460 		if (pte & PG_PS)
2461 			break;
2462 
2463 		if (level > 1) {
2464 			pt_paddr = pte & PG_FRAME;
2465 			shift -= shift_width;
2466 			mask = mask >> shift_width;
2467 		}
2468 	}
2469 
2470 	low_mask = (1 << shift) - 1;
2471 	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
2472 	*pa = (pte & high_mask) | (va & low_mask);
2473 
2474 	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);
2475 
2476 	return (0);
2477 }
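
/*
 * Worked example (illustrative, 4-level long mode, where L4_SHIFT is 39 on
 * amd64): with shift_width = 9 the walk consumes 9 index bits per level,
 * so for va = 0x00007f1234567000 the page table indices are
 *
 *	level 4: (va >> 39) & 0x1ff = 0x0fe
 *	level 3: (va >> 30) & 0x1ff = 0x048
 *	level 2: (va >> 21) & 0x1ff = 0x1a2
 *	level 1: (va >> 12) & 0x1ff = 0x167
 *
 * and, for a 4 KiB leaf, the final result is
 * *pa = (pte & high_mask) | (va & 0xfff).
 */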
2478 
2479 /*
2480  * vm_pipe_init
2481  *
2482  * Initialize a vm_dev_pipe, setting up its file descriptors and its
2483  * event structure with the given callback.
2484  *
2485  * Parameters:
2486  *  p: pointer to vm_dev_pipe struct to initialize
2487  *  cb: callback to use for READ events on the read end of the pipe
2488  */
2489 void
2490 vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
2491 {
2492 	int ret;
2493 	int fds[2];
2494 
2495 	memset(p, 0, sizeof(struct vm_dev_pipe));
2496 
2497 	ret = pipe(fds);
2498 	if (ret)
2499 		fatal("failed to create vm_dev_pipe pipe");
2500 
2501 	p->read = fds[0];
2502 	p->write = fds[1];
2503 
2504 	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
2505 }
2506 
2507 /*
2508  * vm_pipe_send
2509  *
2510  * Send a message to an emulated device via the provided vm_dev_pipe.
2511  *
2512  * Parameters:
2513  *  p: pointer to initialized vm_dev_pipe
2514  *  msg: message to send in the channel
2515  */
2516 void
2517 vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
2518 {
2519 	size_t n;
2520 	n = write(p->write, &msg, sizeof(msg));
2521 	if (n != sizeof(msg))
2522 		fatal("failed to write to device pipe");
2523 }
2524 
2525 /*
2526  * vm_pipe_recv
2527  *
2528  * Receive a message for an emulated device via the provided vm_dev_pipe.
2529  * Returns the message value, or exits fatally on failure.
2530  *
2531  * Parameters:
2532  *  p: pointer to initialized vm_dev_pipe
2533  *
2534  * Return values:
2535  *  a value of enum pipe_msg_type or fatal exit on read(2) error
2536  */
2537 enum pipe_msg_type
2538 vm_pipe_recv(struct vm_dev_pipe *p)
2539 {
2540 	size_t n;
2541 	enum pipe_msg_type msg;
2542 	n = read(p->read, &msg, sizeof(msg));
2543 	if (n != sizeof(msg))
2544 		fatal("failed to read from device pipe");
2545 
2546 	return msg;
2547 }
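
/*
 * Usage sketch (illustrative, names hypothetical): one side registers the
 * read end with libevent and drains messages from its callback, another
 * thread posts messages with vm_pipe_send():
 *
 *	vm_pipe_init(&dev_pipe, dev_event_cb);
 *	event_add(&dev_pipe.read_ev, NULL);
 *	...
 *	vm_pipe_send(&dev_pipe, MSG_EXAMPLE);	(from the producer thread)
 *
 * and in dev_event_cb():
 *
 *	enum pipe_msg_type msg = vm_pipe_recv(&dev_pipe);
 *
 * A short read or write of the enum-sized message is treated as fatal
 * rather than retried.
 */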
2548 
2549 /*
2550  * Re-map the guest address space using the shared memory file descriptors.
2551  *
2552  * Returns 0 on success, non-zero in the event of failure.
2553  */
2554 int
2555 remap_guest_mem(struct vmd_vm *vm)
2556 {
2557 	struct vm_create_params	*vcp;
2558 	struct vm_mem_range	*vmr;
2559 	size_t			 i, j;
2560 	void			*p = NULL;
2561 	int			 ret;
2562 
2563 	if (vm == NULL)
2564 		return (1);
2565 
2566 	vcp = &vm->vm_params.vmc_params;
2567 
2568 	/*
2569 	 * We've execve'd, so we need to re-map the guest VM memory. Iterate
2570 	 * over all possible vm_mem_range entries so we can initialize all
2571 	 * file descriptors to a value.
2572 	 */
2573 	for (i = 0; i < VMM_MAX_MEM_RANGES; i++) {
2574 		if (i < vcp->vcp_nmemranges) {
2575 			vmr = &vcp->vcp_memranges[i];
2576 			/* Skip ranges we know we don't need right now. */
2577 			if (vmr->vmr_type == VM_MEM_MMIO) {
2578 				log_debug("%s: skipping range i=%zu, type=%d",
2579 				    __func__, i, vmr->vmr_type);
2580 				vm->vm_memfds[i] = -1;
2581 				continue;
2582 			}
2583 			/* Re-mmap the memrange. */
2584 			p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
2585 			    MAP_SHARED | MAP_CONCEAL, vm->vm_memfds[i], 0);
2586 			if (p == MAP_FAILED) {
2587 				ret = errno;
2588 				log_warn("%s: mmap", __func__);
2589 				for (j = 0; j < i; j++) {
2590 					vmr = &vcp->vcp_memranges[j];
2591 					munmap((void *)vmr->vmr_va,
2592 					    vmr->vmr_size);
2593 				}
2594 				return (ret);
2595 			}
2596 			vmr->vmr_va = (vaddr_t)p;
2597 		} else {
2598 			/* Initialize with an invalid fd. */
2599 			vm->vm_memfds[i] = -1;
2600 		}
2601 	}
2602 
2603 	return (0);
2604 }
2605