xref: /openbsd-src/usr.sbin/vmd/vm.c (revision 8550894424f8a4aa4aafb6cd57229dd6ed7cd9dd)
1 /*	$OpenBSD: vm.c,v 1.81 2023/01/08 19:57:17 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* PAGE_SIZE, MAXCOMLEN */
20 #include <sys/types.h>
21 #include <sys/ioctl.h>
22 #include <sys/queue.h>
23 #include <sys/wait.h>
24 #include <sys/uio.h>
25 #include <sys/stat.h>
26 #include <sys/socket.h>
27 #include <sys/time.h>
28 #include <sys/mman.h>
29 #include <sys/resource.h>
30 
31 #include <dev/ic/i8253reg.h>
32 #include <dev/isa/isareg.h>
33 #include <dev/pci/pcireg.h>
34 
35 #include <machine/psl.h>
36 #include <machine/pte.h>
37 #include <machine/specialreg.h>
38 #include <machine/vmmvar.h>
39 
40 #include <net/if.h>
41 
42 #include <errno.h>
43 #include <event.h>
44 #include <fcntl.h>
45 #include <imsg.h>
46 #include <limits.h>
47 #include <poll.h>
48 #include <pthread.h>
49 #include <pthread_np.h>
50 #include <stddef.h>
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <unistd.h>
55 #include <util.h>
56 
57 #include "atomicio.h"
58 #include "fw_cfg.h"
59 #include "i8253.h"
60 #include "i8259.h"
61 #include "loadfile.h"
62 #include "mc146818.h"
63 #include "mmio.h"
64 #include "ns8250.h"
65 #include "pci.h"
66 #include "virtio.h"
67 #include "vmd.h"
68 #include "vmm.h"
69 
70 #define MB(x)	((x) * 1024UL * 1024UL)
71 #define GB(x)	((x) * 1024UL * 1024UL * 1024UL)
72 
73 #define MMIO_NOTYET 0
74 
75 io_fn_t ioports_map[MAX_PORTS];
76 
77 int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
78     struct vmop_create_params *, struct vcpu_reg_state *);
79 void vm_dispatch_vmm(int, short, void *);
80 void *event_thread(void *);
81 void *vcpu_run_loop(void *);
82 int vcpu_exit(struct vm_run_params *);
83 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
84 void create_memory_map(struct vm_create_params *);
85 int alloc_guest_mem(struct vm_create_params *);
86 int vmm_create_vm(struct vm_create_params *);
87 void init_emulated_hw(struct vmop_create_params *, int,
88     int[][VM_MAX_BASE_PER_DISK], int *);
89 void restore_emulated_hw(struct vm_create_params *, int, int *,
90     int[][VM_MAX_BASE_PER_DISK],int);
91 void vcpu_exit_inout(struct vm_run_params *);
92 int vcpu_exit_eptviolation(struct vm_run_params *);
93 uint8_t vcpu_exit_pci(struct vm_run_params *);
94 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
95 int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
96 int send_vm(int, struct vm_create_params *);
97 int dump_send_header(int);
98 int dump_vmr(int, struct vm_mem_range *);
99 int dump_mem(int, struct vm_create_params *);
100 void restore_vmr(int, struct vm_mem_range *);
101 void restore_mem(int, struct vm_create_params *);
102 int restore_vm_params(int, struct vm_create_params *);
103 void pause_vm(struct vm_create_params *);
104 void unpause_vm(struct vm_create_params *);
105 
106 int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int);
107 
108 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
109     size_t);
110 
111 int con_fd;
112 struct vmd_vm *current_vm;
113 
114 extern struct vmd *env;
115 
116 extern char *__progname;
117 
118 pthread_mutex_t threadmutex;
119 pthread_cond_t threadcond;
120 
121 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
122 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
123 pthread_barrier_t vm_pause_barrier;
124 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
125 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
126 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
127 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
128 
129 /*
130  * Represents a standard register set for an OS to be booted
131  * as a flat 64 bit address space.
132  *
133  * NOT set here are:
134  *  RIP
135  *  RSP
136  *  GDTR BASE
137  *
138  * Specific bootloaders should clone this structure and override
139  * those fields as needed.
140  *
141  * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
142  *        features of the CPU in use.
143  */
144 static const struct vcpu_reg_state vcpu_init_flat64 = {
145 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
146 	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
147 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
148 	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
149 	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
150 	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
151 	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
152 	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
153 	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
154 	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
155 	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
156 	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
157 	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
158 	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
159 	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
160 	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
161 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
162 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
163 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
164 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
165 	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
166 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
167 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
168 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
169 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
170 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
171 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
172 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
173 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
174 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
175 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
176 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
177 	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
178 	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
179 };
180 
181 /*
182  * Represents a standard register set for a BIOS to be booted
183  * as a flat 16 bit address space.
184  */
185 static const struct vcpu_reg_state vcpu_init_flat16 = {
186 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
187 	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
188 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
189 	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
190 	.vrs_crs[VCPU_REGS_CR3] = 0,
191 	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
192 	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
193 	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
194 	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
195 	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
196 	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
197 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
198 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
199 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
200 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
201 	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
202 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
203 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
204 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
205 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
206 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
207 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
208 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
209 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
210 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
211 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
212 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
213 	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
214 };
215 
216 /*
217  * loadfile_bios
218  *
219  * Alternatively to loadfile_elf, this function loads a non-ELF BIOS image
220  * directly into memory.
221  *
222  * Parameters:
223  *  fp: the BIOS image file to load
224  *  size: uncompressed size of the image
225  *  (out) vrs: register state to set on init for this kernel
226  *
227  * Return values:
228  *  0 if successful
229  *  -1 if the image is too large or could not be read (errno may indicate the error)
230  */
231 int
232 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
233 {
234 	off_t	 off;
235 
236 	/* Set up a "flat 16 bit" register state for BIOS */
237 	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
238 
239 	/* Seek to the beginning of the BIOS image */
240 	if (gzseek(fp, 0, SEEK_SET) == -1)
241 		return (-1);
242 
243 	/* The BIOS image must end at 1MB */
244 	if ((off = MB(1) - size) < 0)
245 		return (-1);
246 
247 	/* Read BIOS image into memory */
248 	if (mread(fp, off, size) != (size_t)size) {
249 		errno = EIO;
250 		return (-1);
251 	}
252 
253 	if (gzseek(fp, 0, SEEK_SET) == -1)
254 		return (-1);
255 
256 	/* Read a second BIOS copy into memory ending at 4GB */
257 	off = GB(4) - size;
258 	if (mread(fp, off, size) != (size_t)size) {
259 		errno = EIO;
260 		return (-1);
261 	}
262 
263 	log_debug("%s: loaded BIOS image", __func__);
264 
265 	return (0);
266 }
267 
268 /*
269  * start_vm
270  *
271  * After forking a new VM process, starts the new VM with the creation
272  * parameters supplied (in the incoming vm->vm_params field). This
273  * function performs a basic sanity check on the incoming parameters
274  * and then performs the following steps to complete the creation of the VM:
275  *
276  * 1. validates and creates the new VM
277  * 2. opens the imsg control channel to the parent and drops more privilege
278  * 3. drops additional privileges by calling pledge(2)
279  * 4. loads the kernel from the disk image or file descriptor
280  * 5. runs the VM's VCPU loops.
281  *
282  * Parameters:
283  *  vm: The VM data structure containing the VM create parameters.
284  *  fd: The imsg socket that is connected to the parent process.
285  *
286  * Return values:
287  *  0: success
288  *  !0 : failure - typically an errno indicating the source of the failure
289  */
290 int
291 start_vm(struct vmd_vm *vm, int fd)
292 {
293 	struct vmop_create_params *vmc = &vm->vm_params;
294 	struct vm_create_params	*vcp = &vmc->vmc_params;
295 	struct vcpu_reg_state	 vrs;
296 	int			 nicfds[VMM_MAX_NICS_PER_VM];
297 	int			 ret;
298 	gzFile			 fp;
299 	size_t			 i;
300 	struct vm_rwregs_params  vrp;
301 	struct stat		 sb;
302 
303 	/* Child */
304 	setproctitle("%s", vcp->vcp_name);
305 	log_procinit(vcp->vcp_name);
306 
307 	if (!(vm->vm_state & VM_STATE_RECEIVED))
308 		create_memory_map(vcp);
309 
310 	ret = alloc_guest_mem(vcp);
311 
312 	if (ret) {
313 		struct rlimit lim;
314 		char buf[FMT_SCALED_STRSIZE];
315 		if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) {
316 			if (fmt_scaled(lim.rlim_cur, buf) == 0)
317 				fatalx("could not allocate guest memory (data "
318 				    "limit is %s)", buf);
319 		}
320 		errno = ret;
321 		fatal("could not allocate guest memory");
322 	}
323 
324 	ret = vmm_create_vm(vcp);
325 	current_vm = vm;
326 
327 	/* send back the kernel-generated vm id (0 on error) */
328 	if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
329 	    sizeof(vcp->vcp_id))
330 		fatal("failed to send created vm id to vmm process");
331 
332 	if (ret) {
333 		errno = ret;
334 		fatal("create vmm ioctl failed - exiting");
335 	}
336 
337 	/*
338 	 * pledge in the vm processes:
339 	 * stdio - for malloc and basic I/O including events.
340 	 * recvfd - for send/recv.
341 	 * vmm - for the vmm ioctls and operations.
342 	 */
343 	if (pledge("stdio vmm recvfd", NULL) == -1)
344 		fatal("pledge");
345 
346 	if (vm->vm_state & VM_STATE_RECEIVED) {
347 		ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp));
348 		if (ret != sizeof(vrp))
349 			fatal("received incomplete vrp - exiting");
350 		vrs = vrp.vrwp_regs;
351 	} else {
352 		/*
353 		 * Set up default "flat 64 bit" register state - RIP,
354 		 * RSP, and GDT info will be set in bootloader
355 		 */
356 		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
357 
358 		/* Find and open kernel image */
359 		if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
360 			fatalx("failed to open kernel - exiting");
361 
362 		/* Load kernel image */
363 		ret = loadfile_elf(fp, vcp, &vrs, vmc->vmc_bootdevice);
364 
365 		/*
366 		 * Try BIOS as a fallback (only if it was provided as an image
367 		 * with vm->vm_kernel and the file is not compressed)
368 		 */
369 		if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
370 		    gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
371 			ret = loadfile_bios(fp, sb.st_size, &vrs);
372 
373 		if (ret)
374 			fatal("failed to load kernel or BIOS - exiting");
375 
376 		gzclose(fp);
377 	}
378 
379 	if (vm->vm_kernel != -1)
380 		close(vm->vm_kernel);
381 
382 	con_fd = vm->vm_tty;
383 	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
384 		fatal("failed to set nonblocking mode on console");
385 
386 	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
387 		nicfds[i] = vm->vm_ifs[i].vif_fd;
388 
389 	event_init();
390 
391 	if (vm->vm_state & VM_STATE_RECEIVED) {
392 		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
393 		    vm->vm_disks, vm->vm_cdrom);
394 		restore_mem(vm->vm_receive_fd, vcp);
395 		if (restore_vm_params(vm->vm_receive_fd, vcp))
396 			fatal("restore vm params failed");
397 		unpause_vm(vcp);
398 	}
399 
400 	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
401 		fatal("setup vm pipe");
402 
403 	/* Execute the vcpu run loop(s) for this VM */
404 	ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);
405 
406 	/* Ensure that any in-flight data is written back */
407 	virtio_shutdown(vm);
408 
409 	return (ret);
410 }
411 
412 /*
413  * vm_dispatch_vmm
414  *
415  * imsg callback for messages that are received from the vmm parent process.
416  */
417 void
418 vm_dispatch_vmm(int fd, short event, void *arg)
419 {
420 	struct vmd_vm		*vm = arg;
421 	struct vmop_result	 vmr;
422 	struct vmop_addr_result	 var;
423 	struct imsgev		*iev = &vm->vm_iev;
424 	struct imsgbuf		*ibuf = &iev->ibuf;
425 	struct imsg		 imsg;
426 	ssize_t			 n;
427 	int			 verbose;
428 
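	/*
	 * Standard imsg event handling: drain reads and flush pending
	 * writes first (a return of 0 means the parent closed the
	 * channel, so exit), then process any complete messages below.
	 */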
429 	if (event & EV_READ) {
430 		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
431 			fatal("%s: imsg_read", __func__);
432 		if (n == 0)
433 			_exit(0);
434 	}
435 
436 	if (event & EV_WRITE) {
437 		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
438 			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
439 		if (n == 0)
440 			_exit(0);
441 	}
442 
443 	for (;;) {
444 		if ((n = imsg_get(ibuf, &imsg)) == -1)
445 			fatal("%s: imsg_get", __func__);
446 		if (n == 0)
447 			break;
448 
449 #if DEBUG > 1
450 		log_debug("%s: got imsg %d from %s",
451 		    __func__, imsg.hdr.type,
452 		    vm->vm_params.vmc_params.vcp_name);
453 #endif
454 
455 		switch (imsg.hdr.type) {
456 		case IMSG_CTL_VERBOSE:
457 			IMSG_SIZE_CHECK(&imsg, &verbose);
458 			memcpy(&verbose, imsg.data, sizeof(verbose));
459 			log_setverbose(verbose);
460 			break;
461 		case IMSG_VMDOP_VM_SHUTDOWN:
462 			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
463 				_exit(0);
464 			break;
465 		case IMSG_VMDOP_VM_REBOOT:
466 			if (vmmci_ctl(VMMCI_REBOOT) == -1)
467 				_exit(0);
468 			break;
469 		case IMSG_VMDOP_PAUSE_VM:
470 			vmr.vmr_result = 0;
471 			vmr.vmr_id = vm->vm_vmid;
472 			pause_vm(&vm->vm_params.vmc_params);
473 			imsg_compose_event(&vm->vm_iev,
474 			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
475 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
476 			    sizeof(vmr));
477 			break;
478 		case IMSG_VMDOP_UNPAUSE_VM:
479 			vmr.vmr_result = 0;
480 			vmr.vmr_id = vm->vm_vmid;
481 			unpause_vm(&vm->vm_params.vmc_params);
482 			imsg_compose_event(&vm->vm_iev,
483 			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
484 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
485 			    sizeof(vmr));
486 			break;
487 		case IMSG_VMDOP_SEND_VM_REQUEST:
488 			vmr.vmr_id = vm->vm_vmid;
489 			vmr.vmr_result = send_vm(imsg.fd,
490 			    &vm->vm_params.vmc_params);
491 			imsg_compose_event(&vm->vm_iev,
492 			    IMSG_VMDOP_SEND_VM_RESPONSE,
493 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
494 			    sizeof(vmr));
495 			if (!vmr.vmr_result) {
496 				imsg_flush(&current_vm->vm_iev.ibuf);
497 				_exit(0);
498 			}
499 			break;
500 		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
501 			IMSG_SIZE_CHECK(&imsg, &var);
502 			memcpy(&var, imsg.data, sizeof(var));
503 
504 			log_debug("%s: received tap addr %s for nic %d",
505 			    vm->vm_params.vmc_params.vcp_name,
506 			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);
507 
508 			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
509 			break;
510 		default:
511 			fatalx("%s: got invalid imsg %d from %s",
512 			    __func__, imsg.hdr.type,
513 			    vm->vm_params.vmc_params.vcp_name);
514 		}
515 		imsg_free(&imsg);
516 	}
517 	imsg_event_add(iev);
518 }
519 
520 /*
521  * vm_shutdown
522  *
523  * Tell the vmm parent process to shut down or reboot the VM, then exit.
524  */
525 __dead void
526 vm_shutdown(unsigned int cmd)
527 {
528 	switch (cmd) {
529 	case VMMCI_NONE:
530 	case VMMCI_SHUTDOWN:
531 		(void)imsg_compose_event(&current_vm->vm_iev,
532 		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
533 		break;
534 	case VMMCI_REBOOT:
535 		(void)imsg_compose_event(&current_vm->vm_iev,
536 		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
537 		break;
538 	default:
539 		fatalx("invalid vm ctl command: %d", cmd);
540 	}
541 	imsg_flush(&current_vm->vm_iev.ibuf);
542 
543 	_exit(0);
544 }
545 
546 int
547 send_vm(int fd, struct vm_create_params *vcp)
548 {
549 	struct vm_rwregs_params	   vrp;
550 	struct vm_rwvmparams_params vpp;
551 	struct vmop_create_params *vmc;
552 	struct vm_terminate_params vtp;
553 	unsigned int		   flags = 0;
554 	unsigned int		   i;
555 	int			   ret = 0;
556 	size_t			   sz;
557 
558 	if (dump_send_header(fd)) {
559 		log_info("%s: failed to send vm dump header", __func__);
560 		goto err;
561 	}
562 
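	/* Pause vcpus and emulated devices so the dumped state is consistent. */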
563 	pause_vm(vcp);
564 
565 	vmc = calloc(1, sizeof(struct vmop_create_params));
566 	if (vmc == NULL) {
567 		log_warn("%s: calloc error getting vmc", __func__);
568 		ret = -1;
569 		goto err;
570 	}
571 
572 	flags |= VMOP_CREATE_MEMORY;
573 	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
574 	    vmop_create_params));
575 	vmc->vmc_flags = flags;
576 	vrp.vrwp_vm_id = vcp->vcp_id;
577 	vrp.vrwp_mask = VM_RWREGS_ALL;
578 	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
579 	vpp.vpp_vm_id = vcp->vcp_id;
580 
581 	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
582 	if (sz != sizeof(struct vmop_create_params)) {
583 		ret = -1;
584 		goto err;
585 	}
586 
587 	for (i = 0; i < vcp->vcp_ncpus; i++) {
588 		vrp.vrwp_vcpu_id = i;
589 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
590 			log_warn("%s: readregs failed", __func__);
591 			goto err;
592 		}
593 
594 		sz = atomicio(vwrite, fd, &vrp,
595 		    sizeof(struct vm_rwregs_params));
596 		if (sz != sizeof(struct vm_rwregs_params)) {
597 			log_warn("%s: dumping registers failed", __func__);
598 			ret = -1;
599 			goto err;
600 		}
601 	}
602 
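	/*
	 * Dump emulated device state followed by guest memory. The order
	 * must match what restore_emulated_hw()/restore_mem() expect on
	 * the receiving side.
	 */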
603 	if ((ret = i8253_dump(fd)))
604 		goto err;
605 	if ((ret = i8259_dump(fd)))
606 		goto err;
607 	if ((ret = ns8250_dump(fd)))
608 		goto err;
609 	if ((ret = mc146818_dump(fd)))
610 		goto err;
611 	if ((ret = fw_cfg_dump(fd)))
612 		goto err;
613 	if ((ret = pci_dump(fd)))
614 		goto err;
615 	if ((ret = virtio_dump(fd)))
616 		goto err;
617 	if ((ret = dump_mem(fd, vcp)))
618 		goto err;
619 
620 	for (i = 0; i < vcp->vcp_ncpus; i++) {
621 		vpp.vpp_vcpu_id = i;
622 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
623 			log_warn("%s: readvmparams failed", __func__);
624 			goto err;
625 		}
626 
627 		sz = atomicio(vwrite, fd, &vpp,
628 		    sizeof(struct vm_rwvmparams_params));
629 		if (sz != sizeof(struct vm_rwvmparams_params)) {
630 			log_warn("%s: dumping vm params failed", __func__);
631 			ret = -1;
632 			goto err;
633 		}
634 	}
635 
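	/* Everything was sent; terminate the local in-kernel VM. */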
636 	vtp.vtp_vm_id = vcp->vcp_id;
637 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
638 		log_warnx("%s: term IOC error: %d, %d", __func__,
639 		    errno, ENOENT);
640 	}
641 err:
642 	close(fd);
643 	if (ret)
644 		unpause_vm(vcp);
645 	return ret;
646 }
647 
648 int
649 dump_send_header(int fd) {
650 	struct vm_dump_header	   vmh;
651 	int			   i;
652 
653 	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
654 	    sizeof(vmh.vmh_signature));
655 
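	/*
	 * Record a fixed set of host CPUID leaves (basic info, feature
	 * flags, extended features, XSAVE, AMD extended) in the header;
	 * the receiving side can use these to judge compatibility.
	 */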
656 	vmh.vmh_cpuids[0].code = 0x00;
657 	vmh.vmh_cpuids[0].leaf = 0x00;
658 
659 	vmh.vmh_cpuids[1].code = 0x01;
660 	vmh.vmh_cpuids[1].leaf = 0x00;
661 
662 	vmh.vmh_cpuids[2].code = 0x07;
663 	vmh.vmh_cpuids[2].leaf = 0x00;
664 
665 	vmh.vmh_cpuids[3].code = 0x0d;
666 	vmh.vmh_cpuids[3].leaf = 0x00;
667 
668 	vmh.vmh_cpuids[4].code = 0x80000001;
669 	vmh.vmh_cpuids[4].leaf = 0x00;
670 
671 	vmh.vmh_version = VM_DUMP_VERSION;
672 
673 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
674 		CPUID_LEAF(vmh.vmh_cpuids[i].code,
675 		    vmh.vmh_cpuids[i].leaf,
676 		    vmh.vmh_cpuids[i].a,
677 		    vmh.vmh_cpuids[i].b,
678 		    vmh.vmh_cpuids[i].c,
679 		    vmh.vmh_cpuids[i].d);
680 	}
681 
682 	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
683 		return (-1);
684 
685 	return (0);
686 }
687 
688 int
689 dump_mem(int fd, struct vm_create_params *vcp)
690 {
691 	unsigned int	i;
692 	int		ret;
693 	struct		vm_mem_range *vmr;
694 
695 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
696 		vmr = &vcp->vcp_memranges[i];
697 		ret = dump_vmr(fd, vmr);
698 		if (ret)
699 			return ret;
700 	}
701 	return (0);
702 }
703 
704 int
705 restore_vm_params(int fd, struct vm_create_params *vcp) {
706 	unsigned int			i;
707 	struct vm_rwvmparams_params    vpp;
708 
709 	for (i = 0; i < vcp->vcp_ncpus; i++) {
710 		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
711 			log_warn("%s: error restoring vm params", __func__);
712 			return (-1);
713 		}
714 		vpp.vpp_vm_id = vcp->vcp_id;
715 		vpp.vpp_vcpu_id = i;
716 		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
717 			log_debug("%s: writing vm params failed", __func__);
718 			return (-1);
719 		}
720 	}
721 	return (0);
722 }
723 
724 void
725 restore_mem(int fd, struct vm_create_params *vcp)
726 {
727 	unsigned int	     i;
728 	struct vm_mem_range *vmr;
729 
730 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
731 		vmr = &vcp->vcp_memranges[i];
732 		restore_vmr(fd, vmr);
733 	}
734 }
735 
736 int
737 dump_vmr(int fd, struct vm_mem_range *vmr)
738 {
739 	size_t	rem = vmr->vmr_size, read = 0;
740 	char	buf[PAGE_SIZE];
741 
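	/* Write the range to fd one PAGE_SIZE chunk at a time. */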
742 	while (rem > 0) {
743 		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
744 			log_warn("failed to read vmr");
745 			return (-1);
746 		}
747 		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
748 			log_warn("failed to dump vmr");
749 			return (-1);
750 		}
751 		rem = rem - PAGE_SIZE;
752 		read = read + PAGE_SIZE;
753 	}
754 	return (0);
755 }
756 
757 void
758 restore_vmr(int fd, struct vm_mem_range *vmr)
759 {
760 	size_t	rem = vmr->vmr_size, wrote = 0;
761 	char	buf[PAGE_SIZE];
762 
763 	while (rem > 0) {
764 		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
765 			fatal("failed to restore vmr");
766 		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
767 			fatal("failed to write vmr");
768 		rem = rem - PAGE_SIZE;
769 		wrote = wrote + PAGE_SIZE;
770 	}
771 }
772 
773 void
774 pause_vm(struct vm_create_params *vcp)
775 {
776 	unsigned int n;
777 	int ret;
778 	if (current_vm->vm_state & VM_STATE_PAUSED)
779 		return;
780 
781 	current_vm->vm_state |= VM_STATE_PAUSED;
782 
783 	ret = pthread_barrier_init(&vm_pause_barrier, NULL, vcp->vcp_ncpus + 1);
784 	if (ret) {
785 		log_warnx("%s: cannot initialize pause barrier (%d)",
786 		    __progname, ret);
787 		return;
788 	}
789 
790 	for (n = 0; n < vcp->vcp_ncpus; n++) {
791 		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
792 		if (ret) {
793 			log_warnx("%s: can't broadcast vcpu run cond (%d)",
794 			    __func__, (int)ret);
795 			return;
796 		}
797 	}
798 	ret = pthread_barrier_wait(&vm_pause_barrier);
799 	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
800 		log_warnx("%s: could not wait on pause barrier (%d)",
801 		    __func__, (int)ret);
802 		return;
803 	}
804 
805 	ret = pthread_barrier_destroy(&vm_pause_barrier);
806 	if (ret) {
807 		log_warnx("%s: could not destroy pause barrier (%d)",
808 		    __progname, ret);
809 		return;
810 	}
811 
812 	i8253_stop();
813 	mc146818_stop();
814 	ns8250_stop();
815 	virtio_stop(vcp);
816 }
817 
818 void
819 unpause_vm(struct vm_create_params *vcp)
820 {
821 	unsigned int n;
822 	int ret;
823 	if (!(current_vm->vm_state & VM_STATE_PAUSED))
824 		return;
825 
826 	current_vm->vm_state &= ~VM_STATE_PAUSED;
827 	for (n = 0; n < vcp->vcp_ncpus; n++) {
828 		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
829 		if (ret) {
830 			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
831 			    __func__, (int)ret);
832 			return;
833 		}
834 	}
835 
836 	i8253_start();
837 	mc146818_start();
838 	ns8250_start();
839 	virtio_start(vcp);
840 }
841 
842 /*
843  * vcpu_reset
844  *
845  * Requests vmm(4) to reset the VCPUs in the indicated VM to
846  * the register state provided
847  *
848  * Parameters
849  *  vmid: VM ID to reset
850  *  vcpu_id: VCPU ID to reset
851  *  vrs: the register state to initialize
852  *
853  * Return values:
854  *  0: success
855  *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
856  *      valid)
857  */
858 int
859 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
860 {
861 	struct vm_resetcpu_params vrp;
862 
863 	memset(&vrp, 0, sizeof(vrp));
864 	vrp.vrp_vm_id = vmid;
865 	vrp.vrp_vcpu_id = vcpu_id;
866 	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
867 
868 	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
869 
870 	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
871 		return (errno);
872 
873 	return (0);
874 }
875 
876 /*
877  * create_memory_map
878  *
879  * Sets up the guest physical memory ranges that the VM can access.
880  *
881  * Parameters:
882  *  vcp: VM create parameters describing the VM whose memory map
883  *       is being created
884  *
885  * Return values:
886  *  nothing
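 *
 * Resulting layout (a sketch of the assignments below): low RAM up to
 * LOWMEM_KB, a reserved hole from there to 1MB, RAM from 1MB up to the
 * start of the PCI MMIO BAR range, the MMIO range itself, a reserved 2MB
 * BIOS copy ending at 4GB, and any remaining memory as RAM above 4GB.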
887  */
888 void
889 create_memory_map(struct vm_create_params *vcp)
890 {
891 	size_t len, mem_bytes;
892 	size_t above_1m = 0, above_4g = 0;
893 
894 	mem_bytes = vcp->vcp_memranges[0].vmr_size;
895 	vcp->vcp_nmemranges = 0;
896 	if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE)
897 		return;
898 
899 	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
900 	len = LOWMEM_KB * 1024;
901 	vcp->vcp_memranges[0].vmr_gpa = 0x0;
902 	vcp->vcp_memranges[0].vmr_size = len;
903 	vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM;
904 	mem_bytes -= len;
905 
906 	/*
907 	 * Second memory region: LOWMEM_KB - 1MB.
908 	 *
909 	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
910 	 * We have to add this region, because some systems
911 	 * unconditionally write to 0xb8000 (VGA RAM), and
912 	 * we need to make sure that vmm(4) permits accesses
913 	 * to it. So allocate guest memory for it.
914 	 */
915 	len = MB(1) - (LOWMEM_KB * 1024);
916 	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
917 	vcp->vcp_memranges[1].vmr_size = len;
918 	vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED;
919 	mem_bytes -= len;
920 
921 	/* If we have less than 2MB remaining, still create a 2nd BIOS area. */
922 	if (mem_bytes <= MB(2)) {
923 		vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END;
924 		vcp->vcp_memranges[2].vmr_size = MB(2);
925 		vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED;
926 		vcp->vcp_nmemranges = 3;
927 		return;
928 	}
929 
930 	/*
931 	 * Calculate how to split any remaining memory across the 4GB
932 	 * boundary while making sure we do not place physical memory into
933 	 * MMIO ranges.
934 	 */
935 	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) {
936 		above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1);
937 		above_4g = mem_bytes - above_1m;
938 	} else {
939 		above_1m = mem_bytes;
940 		above_4g = 0;
941 	}
942 
943 	/* Third memory region: area above 1MB to MMIO region */
944 	vcp->vcp_memranges[2].vmr_gpa = MB(1);
945 	vcp->vcp_memranges[2].vmr_size = above_1m;
946 	vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM;
947 
948 	/* Fourth region: PCI MMIO range */
949 	vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE;
950 	vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END -
951 	    VMM_PCI_MMIO_BAR_BASE + 1;
952 	vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO;
953 
954 	/* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */
955 	vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
956 	vcp->vcp_memranges[4].vmr_size = MB(2);
957 	vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED;
958 
959 	/* Sixth region: any remainder above 4GB */
960 	if (above_4g > 0) {
961 		vcp->vcp_memranges[5].vmr_gpa = GB(4);
962 		vcp->vcp_memranges[5].vmr_size = above_4g;
963 		vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM;
964 		vcp->vcp_nmemranges = 6;
965 	} else
966 		vcp->vcp_nmemranges = 5;
967 }
968 
969 /*
970  * alloc_guest_mem
971  *
972  * Allocates memory for the guest.
973  * Instead of doing a single allocation with one mmap(), we allocate memory
974  * separately for every range for the following reasons:
975  * - ASLR for the individual ranges
976  * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
977  *   map the single mmap'd userspace memory to the individual guest physical
978  *   memory ranges, the underlying amap of the single mmap'd range would have
979  *   to allocate per-page reference counters. The reason is that the
980  *   individual guest physical ranges would reference the single mmap'd region
981  *   only partially. However, if every guest physical range has its own
982  *   corresponding mmap'd userspace allocation, there are no partial
983  *   references: every guest physical range fully references an mmap'd
984  *   range => no per-page reference counters have to be allocated.
985  *
986  * Return values:
987  *  0: success
988  *  !0: failure - errno indicating the source of the failure
989  */
990 int
991 alloc_guest_mem(struct vm_create_params *vcp)
992 {
993 	void *p;
994 	int ret;
995 	size_t i, j;
996 	struct vm_mem_range *vmr;
997 
998 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
999 		vmr = &vcp->vcp_memranges[i];
1000 		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
1001 		    MAP_PRIVATE | MAP_ANON, -1, 0);
1002 		if (p == MAP_FAILED) {
1003 			ret = errno;
1004 			for (j = 0; j < i; j++) {
1005 				vmr = &vcp->vcp_memranges[j];
1006 				munmap((void *)vmr->vmr_va, vmr->vmr_size);
1007 			}
1008 
1009 			return (ret);
1010 		}
1011 
1012 		vmr->vmr_va = (vaddr_t)p;
1013 	}
1014 
1015 	return (0);
1016 }
1017 
1018 /*
1019  * vmm_create_vm
1020  *
1021  * Requests vmm(4) to create a new VM using the supplied creation
1022  * parameters. This operation results in the creation of the in-kernel
1023  * structures for the VM, but does not start the VM's vcpu(s).
1024  *
1025  * Parameters:
1026  *  vcp: vm_create_params struct containing the VM's desired creation
1027  *      configuration
1028  *
1029  * Return values:
1030  *  0: success
1031  *  !0 : ioctl to vmm(4) failed
1032  */
1033 int
1034 vmm_create_vm(struct vm_create_params *vcp)
1035 {
1036 	/* Sanity check arguments */
1037 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
1038 		return (EINVAL);
1039 
1040 	if (vcp->vcp_nmemranges == 0 ||
1041 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1042 		return (EINVAL);
1043 
1044 	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
1045 		return (EINVAL);
1046 
1047 	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
1048 		return (EINVAL);
1049 
1050 	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
1051 		return (errno);
1052 
1053 	return (0);
1054 }
1055 
1056 /*
1057  * init_emulated_hw
1058  *
1059  * Initializes the userspace hardware emulation
1060  */
1061 void
1062 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
1063     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1064 {
1065 	struct vm_create_params *vcp = &vmc->vmc_params;
1066 	size_t i;
1067 	uint64_t memlo, memhi;
1068 
1069 	/* Calculate memory size for NVRAM registers */
1070 	memlo = memhi = 0;
1071 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1072 		if (vcp->vcp_memranges[i].vmr_gpa == MB(1) &&
1073 		    vcp->vcp_memranges[i].vmr_size > (15 * MB(1)))
1074 			memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1));
1075 		else if (vcp->vcp_memranges[i].vmr_gpa == GB(4))
1076 			memhi = vcp->vcp_memranges[i].vmr_size;
1077 	}
1078 
1079 	/* Reset the IO port map */
1080 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1081 
1082 	/* Init i8253 PIT */
1083 	i8253_init(vcp->vcp_id);
1084 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1085 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1086 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1087 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1088 	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
1089 
1090 	/* Init mc146818 RTC */
1091 	mc146818_init(vcp->vcp_id, memlo, memhi);
1092 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1093 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1094 
1095 	/* Init master and slave PICs */
1096 	i8259_init();
1097 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1098 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1099 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1100 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1101 	ioports_map[ELCR0] = vcpu_exit_elcr;
1102 	ioports_map[ELCR1] = vcpu_exit_elcr;
1103 
1104 	/* Init ns8250 UART */
1105 	ns8250_init(con_fd, vcp->vcp_id);
1106 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1107 		ioports_map[i] = vcpu_exit_com;
1108 
1109 	/* Init QEMU fw_cfg interface */
1110 	fw_cfg_init(vmc);
1111 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1112 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1113 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1114 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1115 
1116 	/* Initialize PCI */
1117 	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
1118 		ioports_map[i] = vcpu_exit_pci;
1119 
1120 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1121 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1122 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1123 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1124 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1125 	pci_init();
1126 
1127 	/* Initialize virtio devices */
1128 	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
1129 }
1130 /*
1131  * restore_emulated_hw
1132  *
1133  * Restores the userspace hardware emulation from fd
1134  */
1135 void
1136 restore_emulated_hw(struct vm_create_params *vcp, int fd,
1137     int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
1138 {
1139 	/* struct vm_create_params *vcp = &vmc->vmc_params; */
1140 	int i;
1141 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1142 
1143 	/* Init i8253 PIT */
1144 	i8253_restore(fd, vcp->vcp_id);
1145 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1146 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1147 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1148 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1149 
1150 	/* Init master and slave PICs */
1151 	i8259_restore(fd);
1152 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1153 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1154 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1155 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1156 
1157 	/* Init ns8250 UART */
1158 	ns8250_restore(fd, con_fd, vcp->vcp_id);
1159 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1160 		ioports_map[i] = vcpu_exit_com;
1161 
1162 	/* Init mc146818 RTC */
1163 	mc146818_restore(fd, vcp->vcp_id);
1164 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1165 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1166 
1167 	/* Init QEMU fw_cfg interface */
1168 	fw_cfg_restore(fd);
1169 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1170 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1171 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1172 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1173 
1174 	/* Initialize PCI */
1175 	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
1176 		ioports_map[i] = vcpu_exit_pci;
1177 
1178 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1179 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1180 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1181 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1182 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1183 	pci_restore(fd);
1184 	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
1185 }
1186 
1187 /*
1188  * run_vm
1189  *
1190  * Runs the VM whose creation parameters are specified in vcp
1191  *
1192  * Parameters:
1193  *  child_cdrom: previously-opened child ISO disk file descriptor
1194  *  child_disks: previously-opened child VM disk file descriptors
1195  *  child_taps: previously-opened child tap file descriptors
1196  *  vmc: vmop_create_params struct containing the VM's desired creation
1197  *      configuration
1198  *  vrs: VCPU register state to initialize
1199  *
1200  * Return values:
1201  *  0: the VM exited normally
1202  *  !0 : the VM exited abnormally or failed to start
1203  */
1204 int
1205 run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
1206     int *child_taps, struct vmop_create_params *vmc,
1207     struct vcpu_reg_state *vrs)
1208 {
1209 	struct vm_create_params *vcp = &vmc->vmc_params;
1210 	struct vm_rwregs_params vregsp;
1211 	uint8_t evdone = 0;
1212 	size_t i;
1213 	int ret;
1214 	pthread_t *tid, evtid;
1215 	char tname[MAXCOMLEN + 1];
1216 	struct vm_run_params **vrp;
1217 	void *exit_status;
1218 
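	/* Sanity-check the creation parameters before starting any threads. */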
1219 	if (vcp == NULL)
1220 		return (EINVAL);
1221 
1222 	if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
1223 		return (EINVAL);
1224 
1225 	if (child_disks == NULL && vcp->vcp_ndisks != 0)
1226 		return (EINVAL);
1227 
1228 	if (child_taps == NULL && vcp->vcp_nnics != 0)
1229 		return (EINVAL);
1230 
1231 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
1232 		return (EINVAL);
1233 
1234 	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
1235 		return (EINVAL);
1236 
1237 	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
1238 		return (EINVAL);
1239 
1240 	if (vcp->vcp_nmemranges == 0 ||
1241 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1242 		return (EINVAL);
1243 
1244 	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
1245 	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
1246 	if (tid == NULL || vrp == NULL) {
1247 		log_warn("%s: memory allocation error - exiting.",
1248 		    __progname);
1249 		return (ENOMEM);
1250 	}
1251 
1252 	log_debug("%s: initializing hardware for vm %s", __func__,
1253 	    vcp->vcp_name);
1254 
1255 	if (!(current_vm->vm_state & VM_STATE_RECEIVED))
1256 		init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);
1257 
1258 	ret = pthread_mutex_init(&threadmutex, NULL);
1259 	if (ret) {
1260 		log_warn("%s: could not initialize thread state mutex",
1261 		    __func__);
1262 		return (ret);
1263 	}
1264 	ret = pthread_cond_init(&threadcond, NULL);
1265 	if (ret) {
1266 		log_warn("%s: could not initialize thread state "
1267 		    "condition variable", __func__);
1268 		return (ret);
1269 	}
1270 
1271 	mutex_lock(&threadmutex);
1272 
1273 	log_debug("%s: starting vcpu threads for vm %s", __func__,
1274 	    vcp->vcp_name);
1275 
1276 	/*
1277 	 * Create and launch one thread for each VCPU. These threads may
1278 	 * migrate between PCPUs over time; the need to reload CPU state
1279 	 * in such situations is detected and performed by vmm(4) in the
1280 	 * kernel.
1281 	 */
1282 	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
1283 		vrp[i] = malloc(sizeof(struct vm_run_params));
1284 		if (vrp[i] == NULL) {
1285 			log_warn("%s: memory allocation error - "
1286 			    "exiting.", __progname);
1287 			/* caller will exit, so skip freeing */
1288 			return (ENOMEM);
1289 		}
1290 		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
1291 		if (vrp[i]->vrp_exit == NULL) {
1292 			log_warn("%s: memory allocation error - "
1293 			    "exiting.", __progname);
1294 			/* caller will exit, so skip freeing */
1295 			return (ENOMEM);
1296 		}
1297 		vrp[i]->vrp_vm_id = vcp->vcp_id;
1298 		vrp[i]->vrp_vcpu_id = i;
1299 
1300 		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
1301 			log_warnx("%s: cannot reset VCPU %zu - exiting.",
1302 			    __progname, i);
1303 			return (EIO);
1304 		}
1305 
1306 		/* Write regs once more, as vcpu_reset() above changes them */
1307 		if (current_vm->vm_state & VM_STATE_RECEIVED) {
1308 			vregsp.vrwp_vm_id = vcp->vcp_id;
1309 			vregsp.vrwp_vcpu_id = i;
1310 			vregsp.vrwp_regs = *vrs;
1311 			vregsp.vrwp_mask = VM_RWREGS_ALL;
1312 			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
1313 			    &vregsp)) == -1) {
1314 				log_warn("%s: writeregs failed", __func__);
1315 				return (ret);
1316 			}
1317 		}
1318 
1319 		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
1320 		if (ret) {
1321 			log_warnx("%s: cannot initialize cond var (%d)",
1322 			    __progname, ret);
1323 			return (ret);
1324 		}
1325 
1326 		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
1327 		if (ret) {
1328 			log_warnx("%s: cannot initialize mtx (%d)",
1329 			    __progname, ret);
1330 			return (ret);
1331 		}
1332 
1333 		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
1334 		if (ret) {
1335 			log_warnx("%s: cannot initialize unpause var (%d)",
1336 			    __progname, ret);
1337 			return (ret);
1338 		}
1339 
1340 		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
1341 		if (ret) {
1342 			log_warnx("%s: cannot initialize unpause mtx (%d)",
1343 			    __progname, ret);
1344 			return (ret);
1345 		}
1346 
1347 		vcpu_hlt[i] = 0;
1348 
1349 		/* Start each VCPU run thread at vcpu_run_loop */
1350 		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
1351 		if (ret) {
1352 			/* caller will _exit after this return */
1353 			errno = ret;
1354 			log_warn("%s: could not create vcpu thread %zu",
1355 			    __func__, i);
1356 			return (ret);
1357 		}
1358 
1359 		snprintf(tname, sizeof(tname), "vcpu-%zu", i);
1360 		pthread_set_name_np(tid[i], tname);
1361 	}
1362 
1363 	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
1364 	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
1365 	if (ret) {
1366 		errno = ret;
1367 		log_warn("%s: could not create event thread", __func__);
1368 		return (ret);
1369 	}
1370 	pthread_set_name_np(evtid, "event");
1371 
1372 	for (;;) {
1373 		ret = pthread_cond_wait(&threadcond, &threadmutex);
1374 		if (ret) {
1375 			log_warn("%s: waiting on thread state condition "
1376 			    "variable failed", __func__);
1377 			return (ret);
1378 		}
1379 
1380 		/*
1381 		 * Did a VCPU thread exit with an error? => return the first one
1382 		 */
1383 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1384 			if (vcpu_done[i] == 0)
1385 				continue;
1386 
1387 			if (pthread_join(tid[i], &exit_status)) {
1388 				log_warn("%s: failed to join thread %zd - "
1389 				    "exiting", __progname, i);
1390 				return (EIO);
1391 			}
1392 
1393 			ret = (intptr_t)exit_status;
1394 		}
1395 
1396 		/* Did the event thread exit? => return with an error */
1397 		if (evdone) {
1398 			if (pthread_join(evtid, &exit_status)) {
1399 				log_warn("%s: failed to join event thread - "
1400 				    "exiting", __progname);
1401 				return (EIO);
1402 			}
1403 
1404 			log_warnx("%s: vm %d event thread exited "
1405 			    "unexpectedly", __progname, vcp->vcp_id);
1406 			return (EIO);
1407 		}
1408 
1409 		/* Did all VCPU threads exit successfully? => return */
1410 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1411 			if (vcpu_done[i] == 0)
1412 				break;
1413 		}
1414 		if (i == vcp->vcp_ncpus)
1415 			return (ret);
1416 
1417 		/* Some more threads to wait for, start over */
1418 	}
1419 
1420 	return (ret);
1421 }
1422 
1423 void *
1424 event_thread(void *arg)
1425 {
1426 	uint8_t *donep = arg;
1427 	intptr_t ret;
1428 
1429 	ret = event_dispatch();
1430 
1431 	mutex_lock(&threadmutex);
1432 	*donep = 1;
1433 	pthread_cond_signal(&threadcond);
1434 	mutex_unlock(&threadmutex);
1435 
1436 	return (void *)ret;
1437 }
1438 
1439 /*
1440  * vcpu_run_loop
1441  *
1442  * Runs a single VCPU until vmm(4) requires help handling an exit,
1443  * or the VM terminates.
1444  *
1445  * Parameters:
1446  *  arg: vcpu_run_params for the VCPU being run by this thread
1447  *
1448  * Return values:
1449  *  NULL: the VCPU shutdown properly
1450  *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
1451  */
1452 void *
1453 vcpu_run_loop(void *arg)
1454 {
1455 	struct vm_run_params *vrp = (struct vm_run_params *)arg;
1456 	intptr_t ret = 0;
1457 	int irq;
1458 	uint32_t n;
1459 
1460 	vrp->vrp_continue = 0;
1461 	n = vrp->vrp_vcpu_id;
1462 
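	/*
	 * Main vcpu loop: wait while halted or paused, pick up any pending
	 * i8259 interrupt, enter the guest via VMM_IOC_RUN, and hand any
	 * exit that needs userspace help to vcpu_exit().
	 */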
1463 	for (;;) {
1464 		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
1465 
1466 		if (ret) {
1467 			log_warnx("%s: can't lock vcpu run mtx (%d)",
1468 			    __func__, (int)ret);
1469 			return ((void *)ret);
1470 		}
1471 
1472 		/* If we are halted and need to pause, pause */
1473 		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
1474 			ret = pthread_barrier_wait(&vm_pause_barrier);
1475 			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
1476 				log_warnx("%s: could not wait on pause barrier (%d)",
1477 				    __func__, (int)ret);
1478 				return ((void *)ret);
1479 			}
1480 
1481 			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
1482 			if (ret) {
1483 				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
1484 				    __func__, (int)ret);
1485 				return ((void *)ret);
1486 			}
1487 
1488 			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
1489 			    &vcpu_unpause_mtx[n]);
1490 			if (ret) {
1491 				log_warnx(
1492 				    "%s: can't wait on unpause cond (%d)",
1493 				    __func__, (int)ret);
1494 				break;
1495 			}
1496 			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
1497 			if (ret) {
1498 				log_warnx("%s: can't unlock unpause mtx (%d)",
1499 				    __func__, (int)ret);
1500 				break;
1501 			}
1502 		}
1503 
1504 		/* If we are halted and not paused, wait */
1505 		if (vcpu_hlt[n]) {
1506 			ret = pthread_cond_wait(&vcpu_run_cond[n],
1507 			    &vcpu_run_mtx[n]);
1508 
1509 			if (ret) {
1510 				log_warnx(
1511 				    "%s: can't wait on cond (%d)",
1512 				    __func__, (int)ret);
1513 				(void)pthread_mutex_unlock(
1514 				    &vcpu_run_mtx[n]);
1515 				break;
1516 			}
1517 		}
1518 
1519 		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
1520 
1521 		if (ret) {
1522 			log_warnx("%s: can't unlock mutex on cond (%d)",
1523 			    __func__, (int)ret);
1524 			break;
1525 		}
1526 
1527 		if (vrp->vrp_irqready && i8259_is_pending()) {
1528 			irq = i8259_ack();
1529 			vrp->vrp_irq = irq;
1530 		} else
1531 			vrp->vrp_irq = 0xFFFF;
1532 
1533 		/* Still more pending? */
1534 		if (i8259_is_pending()) {
1535 			/*
1536 			 * XXX can probably avoid ioctls here by providing intr
1537 			 * in vrp
1538 			 */
1539 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1540 			    vrp->vrp_vcpu_id, 1)) {
1541 				fatal("can't set INTR");
1542 			}
1543 		} else {
1544 			if (vcpu_pic_intr(vrp->vrp_vm_id,
1545 			    vrp->vrp_vcpu_id, 0)) {
1546 				fatal("can't clear INTR");
1547 			}
1548 		}
1549 
1550 		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
1551 			/* If run ioctl failed, exit */
1552 			ret = errno;
1553 			log_warn("%s: vm %d / vcpu %d run ioctl failed",
1554 			    __func__, vrp->vrp_vm_id, n);
1555 			break;
1556 		}
1557 
1558 		/* If the VM is terminating, exit normally */
1559 		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
1560 			ret = (intptr_t)NULL;
1561 			break;
1562 		}
1563 
1564 		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
1565 			/*
1566 			 * vmm(4) needs help handling an exit, handle in
1567 			 * vcpu_exit.
1568 			 */
1569 			ret = vcpu_exit(vrp);
1570 			if (ret)
1571 				break;
1572 		}
1573 	}
1574 
1575 	mutex_lock(&threadmutex);
1576 	vcpu_done[n] = 1;
1577 	pthread_cond_signal(&threadcond);
1578 	mutex_unlock(&threadmutex);
1579 
1580 	return ((void *)ret);
1581 }
1582 
1583 int
1584 vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
1585 {
1586 	struct vm_intr_params vip;
1587 
1588 	memset(&vip, 0, sizeof(vip));
1589 
1590 	vip.vip_vm_id = vm_id;
1591 	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
1592 	vip.vip_intr = intr;
1593 
1594 	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
1595 		return (errno);
1596 
1597 	return (0);
1598 }
1599 
1600 /*
1601  * vcpu_exit_pci
1602  *
1603  * Handle all I/O to the emulated PCI subsystem.
1604  *
1605  * Parameters:
1606  *  vrp: vcpu run parameters containing guest state for this exit
1607  *
1608  * Return value:
1609  *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
1610  *      be injected.
1611  */
1612 uint8_t
1613 vcpu_exit_pci(struct vm_run_params *vrp)
1614 {
1615 	struct vm_exit *vei = vrp->vrp_exit;
1616 	uint8_t intr;
1617 
1618 	intr = 0xFF;
1619 
1620 	switch (vei->vei.vei_port) {
1621 	case PCI_MODE1_ADDRESS_REG:
1622 		pci_handle_address_reg(vrp);
1623 		break;
1624 	case PCI_MODE1_DATA_REG:
1625 	case PCI_MODE1_DATA_REG + 1:
1626 	case PCI_MODE1_DATA_REG + 2:
1627 	case PCI_MODE1_DATA_REG + 3:
1628 		pci_handle_data_reg(vrp);
1629 		break;
1630 	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
1631 		intr = pci_handle_io(vrp);
1632 		break;
1633 	default:
1634 		log_warnx("%s: unknown PCI register 0x%llx",
1635 		    __progname, (uint64_t)vei->vei.vei_port);
1636 		break;
1637 	}
1638 
1639 	return (intr);
1640 }
1641 
1642 /*
1643  * vcpu_exit_inout
1644  *
1645  * Handle all I/O exits that need to be emulated in vmd. This includes the
1646  * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
1647  *
1648  * Parameters:
1649  *  vrp: vcpu run parameters containing guest state for this exit
1650  */
1651 void
1652 vcpu_exit_inout(struct vm_run_params *vrp)
1653 {
1654 	struct vm_exit *vei = vrp->vrp_exit;
1655 	uint8_t intr = 0xFF;
1656 
1657 	if (vei->vei.vei_rep || vei->vei.vei_string) {
1658 #ifdef MMIO_DEBUG
1659 		log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x",
1660 		    __func__,
1661 		    vei->vei.vei_rep == 0 ? "" : "REP ",
1662 		    vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT",
1663 		    vei->vei.vei_string == 0 ? "" : "S",
1664 		    vei->vei.vei_size, vei->vei.vei_encoding,
1665 		    vei->vei.vei_data, vei->vei.vei_port);
1666 		log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx",
1667 		    __func__,
1668 		    vei->vrs.vrs_gprs[VCPU_REGS_RCX],
1669 		    vei->vrs.vrs_gprs[VCPU_REGS_RDX],
1670 		    vei->vrs.vrs_gprs[VCPU_REGS_RSI]);
1671 #endif /* MMIO_DEBUG */
1672 		fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)",
1673 		    __func__);
1674 	}
1675 
1676 	if (ioports_map[vei->vei.vei_port] != NULL)
1677 		intr = ioports_map[vei->vei.vei_port](vrp);
1678 	else if (vei->vei.vei_dir == VEI_DIR_IN)
1679 		set_return_data(vei, 0xFFFFFFFF);
1680 
1681 	if (intr != 0xFF)
1682 		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
1683 }
1684 
1685 /*
1686  * vcpu_exit_eptviolation
1687  *
1688  * Handle an EPT violation.
1689  *
1690  * Parameters:
1691  *  vrp: vcpu run parameters containing guest state for this exit
1692  *
1693  * Return values:
1694  *  0: no action required
1695  *  EFAULT: a protection fault occurred, kill the vm.
1696  */
1697 int
1698 vcpu_exit_eptviolation(struct vm_run_params *vrp)
1699 {
1700 	struct vm_exit *ve = vrp->vrp_exit;
1701 	int ret = 0;
1702 #if MMIO_NOTYET
1703 	struct x86_insn insn;
1704 	uint64_t va, pa;
1705 	size_t len = 15;		/* Max instruction length in x86. */
1706 #endif /* MMIO_NOTYET */
1707 	switch (ve->vee.vee_fault_type) {
1708 	case VEE_FAULT_HANDLED:
1709 		log_debug("%s: fault already handled", __func__);
1710 		break;
1711 
1712 #if MMIO_NOTYET
1713 	case VEE_FAULT_MMIO_ASSIST:
1714 		/* Intel VMX might give us the length of the instruction. */
1715 		if (ve->vee.vee_insn_info & VEE_LEN_VALID)
1716 			len = ve->vee.vee_insn_len;
1717 
1718 		if (len > 15)
1719 			fatalx("%s: invalid instruction length %lu", __func__,
1720 			    len);
1721 
1722 		/* If we weren't given instruction bytes, we need to fetch. */
1723 		if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) {
1724 			memset(ve->vee.vee_insn_bytes, 0,
1725 			    sizeof(ve->vee.vee_insn_bytes));
1726 			va = ve->vrs.vrs_gprs[VCPU_REGS_RIP];
1727 
1728 			/* XXX Only support instructions that fit on 1 page. */
1729 			if ((va & PAGE_MASK) + len > PAGE_SIZE) {
1730 				log_warnx("%s: instruction might cross page "
1731 				    "boundary", __func__);
1732 				ret = EINVAL;
1733 				break;
1734 			}
1735 
1736 			ret = translate_gva(ve, va, &pa, PROT_EXEC);
1737 			if (ret != 0) {
1738 				log_warnx("%s: failed gva translation",
1739 				    __func__);
1740 				break;
1741 			}
1742 
1743 			ret = read_mem(pa, ve->vee.vee_insn_bytes, len);
1744 			if (ret != 0) {
1745 				log_warnx("%s: failed to fetch instruction "
1746 				    "bytes from 0x%llx", __func__, pa);
1747 				break;
1748 			}
1749 		}
1750 
1751 		ret = insn_decode(ve, &insn);
1752 		if (ret == 0)
1753 			ret = insn_emulate(ve, &insn);
1754 		break;
1755 #endif /* MMIO_NOTYET */
1756 
1757 	case VEE_FAULT_PROTECT:
1758 		log_debug("%s: EPT Violation: rip=0x%llx", __progname,
1759 		    ve->vrs.vrs_gprs[VCPU_REGS_RIP]);
1760 		ret = EFAULT;
1761 		break;
1762 
1763 	default:
1764 		fatalx("%s: invalid fault_type %d", __progname,
1765 		    ve->vee.vee_fault_type);
1766 		/* UNREACHED */
1767 	}
1768 
1769 	return (ret);
1770 }
1771 
1772 /*
1773  * vcpu_exit
1774  *
1775  * Handle a vcpu exit. This function is called when it is determined that
1776  * vmm(4) requires the assistance of vmd to support a particular guest
1777  * exit type (eg, accessing an I/O port or device). Guest state is contained
1778  * in 'vrp', and will be resent to vmm(4) on exit completion.
1779  *
1780  * Upon conclusion of handling the exit, the function determines if any
1781  * interrupts should be injected into the guest, and asserts the proper
1782  * IRQ line whose interrupt should be vectored.
1783  *
1784  * Parameters:
1785  *  vrp: vcpu run parameters containing guest state for this exit
1786  *
1787  * Return values:
1788  *  0: the exit was handled successfully
1789  *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
1790  */
1791 int
1792 vcpu_exit(struct vm_run_params *vrp)
1793 {
1794 	int ret;
1795 
1796 	switch (vrp->vrp_exit_reason) {
1797 	case VMX_EXIT_INT_WINDOW:
1798 	case SVM_VMEXIT_VINTR:
1799 	case VMX_EXIT_CPUID:
1800 	case VMX_EXIT_EXTINT:
1801 	case SVM_VMEXIT_INTR:
1802 	case SVM_VMEXIT_MSR:
1803 	case SVM_VMEXIT_CPUID:
1804 		/*
1805 		 * We may be exiting to vmd to handle a pending interrupt but
1806 		 * at the same time the last exit type may have been one of
1807 		 * these. In this case, there's nothing extra to be done
1808 		 * here (and falling through to the default case below results
1809 		 * in more vmd log spam).
1810 		 */
1811 		break;
1812 	case SVM_VMEXIT_NPF:
1813 	case VMX_EXIT_EPT_VIOLATION:
1814 		ret = vcpu_exit_eptviolation(vrp);
1815 		if (ret)
1816 			return (ret);
1817 		break;
1818 	case VMX_EXIT_IO:
1819 	case SVM_VMEXIT_IOIO:
1820 		vcpu_exit_inout(vrp);
1821 		break;
1822 	case VMX_EXIT_HLT:
1823 	case SVM_VMEXIT_HLT:
1824 		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1825 		if (ret) {
1826 			log_warnx("%s: can't lock vcpu mutex (%d)",
1827 			    __func__, ret);
1828 			return (ret);
1829 		}
1830 		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
1831 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1832 		if (ret) {
1833 			log_warnx("%s: can't unlock vcpu mutex (%d)",
1834 			    __func__, ret);
1835 			return (ret);
1836 		}
1837 		break;
1838 	case VMX_EXIT_TRIPLE_FAULT:
1839 	case SVM_VMEXIT_SHUTDOWN:
1840 		/* reset VM */
1841 		return (EAGAIN);
1842 	default:
1843 		log_debug("%s: unknown exit reason 0x%x",
1844 		    __progname, vrp->vrp_exit_reason);
1845 	}
1846 
1847 	vrp->vrp_continue = 1;
1848 
1849 	return (0);
1850 }
1851 
1852 /*
1853  * find_gpa_range
1854  *
1855  * Search for a contiguous guest physical mem range.
1856  *
1857  * Parameters:
1858  *  vcp: VM create parameters that contain the memory map to search in
1859  *  gpa: the starting guest physical address
1860  *  len: the length of the memory range
1861  *
1862  * Return values:
1863  *  NULL: the described range is not covered by contiguous guest memory ranges
1864  *  Pointer to vm_mem_range that contains the start of the range otherwise.
1865  */
1866 static struct vm_mem_range *
1867 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
1868 {
1869 	size_t i, n;
1870 	struct vm_mem_range *vmr, *first;
1871 
1872 	/* Find the first vm_mem_range that contains gpa */
1873 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1874 		vmr = &vcp->vcp_memranges[i];
1875 		if (gpa < vmr->vmr_gpa + vmr->vmr_size)
1876 			break;
1877 	}
1878 
1879 	/* No range found. */
1880 	if (i == vcp->vcp_nmemranges)
1881 		return (NULL);

	/* Remember the range containing the start of [gpa, gpa + len). */
	first = vmr;
1882 
1883 	/*
1884 	 * vmr may cover the range [gpa, gpa + len) only partly. Make
1885 	 * sure that the following vm_mem_ranges are contiguous and
1886 	 * cover the rest.
1887 	 */
1888 	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
1889 	if (len < n)
1890 		len = 0;
1891 	else
1892 		len -= n;
1893 	gpa = vmr->vmr_gpa + vmr->vmr_size;
1894 	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
1895 		vmr = &vcp->vcp_memranges[i];
1896 		if (gpa != vmr->vmr_gpa)
1897 			return (NULL);
1898 		if (len <= vmr->vmr_size)
1899 			len = 0;
1900 		else
1901 			len -= vmr->vmr_size;
1902 
1903 		gpa = vmr->vmr_gpa + vmr->vmr_size;
1904 	}
1905 
1906 	if (len != 0)
1907 		return (NULL);
1908 
1909 	return (first);
1910 }
1911 
1912 /*
1913  * write_mem
1914  *
1915  * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
1916  *
1917  * Parameters:
1918  *  dst: the destination paddr_t in the guest VM
1919  *  buf: data to copy (or NULL to zero the data)
1920  *  len: number of bytes to copy
1921  *
1922  * Return values:
1923  *  0: success
1924  *  EINVAL: if the guest physical memory range [dst, dst + len) does not
1925  *      exist in the guest.
1926  */
1927 int
1928 write_mem(paddr_t dst, const void *buf, size_t len)
1929 {
1930 	const char *from = buf;
1931 	char *to;
1932 	size_t n, off;
1933 	struct vm_mem_range *vmr;
1934 
1935 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
1936 	if (vmr == NULL) {
1937 		errno = EINVAL;
1938 		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
1939 		    "len = 0x%zx", __func__, dst, len);
1940 		return (EINVAL);
1941 	}
1942 
1943 	off = dst - vmr->vmr_gpa;
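	/*
	 * Copy in chunks, crossing into the next (contiguous) vm_mem_range
	 * once the current one is exhausted.
	 */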
1944 	while (len != 0) {
1945 		n = vmr->vmr_size - off;
1946 		if (len < n)
1947 			n = len;
1948 
1949 		to = (char *)vmr->vmr_va + off;
1950 		if (buf == NULL)
1951 			memset(to, 0, n);
1952 		else {
1953 			memcpy(to, from, n);
1954 			from += n;
1955 		}
1956 		len -= n;
1957 		off = 0;
1958 		vmr++;
1959 	}
1960 
1961 	return (0);
1962 }
1963 
1964 /*
1965  * read_mem
1966  *
1967  * Reads memory at guest paddr 'src' into 'buf'.
1968  *
1969  * Parameters:
1970  *  src: the source paddr_t in the guest VM to read from.
1971  *  buf: destination (local) buffer
1972  *  len: number of bytes to read
1973  *
1974  * Return values:
1975  *  0: success
1976  *  EINVAL: if the guest physical memory range [src, src + len) does not
1977  *      exist in the guest.
1978  */
1979 int
1980 read_mem(paddr_t src, void *buf, size_t len)
1981 {
1982 	char *from, *to = buf;
1983 	size_t n, off;
1984 	struct vm_mem_range *vmr;
1985 
1986 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
1987 	if (vmr == NULL) {
1988 		errno = EINVAL;
1989 		log_warn("%s: failed - invalid memory range src = 0x%lx, "
1990 		    "len = 0x%zx", __func__, src, len);
1991 		return (EINVAL);
1992 	}
1993 
1994 	off = src - vmr->vmr_gpa;
1995 	while (len != 0) {
1996 		n = vmr->vmr_size - off;
1997 		if (len < n)
1998 			n = len;
1999 
2000 		from = (char *)vmr->vmr_va + off;
2001 		memcpy(to, from, n);
2002 
2003 		to += n;
2004 		len -= n;
2005 		off = 0;
2006 		vmr++;
2007 	}
2008 
2009 	return (0);
2010 }
2011 
2012 /*
2013  * hvaddr_mem
2014  *
2015  * Translate a guest physical address to a host virtual address, checking the
2016  * provided memory range length to confirm it's contiguous within the same
2017  * guest memory range (vm_mem_range).
2018  *
2019  * Parameters:
2020  *  gpa: guest physical address to translate
2021  *  len: number of bytes in the intended range
2022  *
2023  * Return values:
2024  *  void* to host virtual memory on success
2025  *  NULL on error, setting errno to:
2026  *    EFAULT: gpa falls outside guest memory ranges
2027  *    EINVAL: requested len extends beyond memory range
2028  */
2029 void *
2030 hvaddr_mem(paddr_t gpa, size_t len)
2031 {
2032 	struct vm_mem_range *vmr;
2033 	size_t off;
2034 
2035 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len);
2036 	if (vmr == NULL) {
2037 		log_warnx("%s: failed - invalid gpa: 0x%lx", __func__, gpa);
2038 		errno = EFAULT;
2039 		return (NULL);
2040 	}
2041 
2042 	off = gpa - vmr->vmr_gpa;
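	/* The requested length must not cross a vm_mem_range boundary. */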
2043 	if (len > (vmr->vmr_size - off)) {
2044 		log_warnx("%s: failed - invalid memory range: gpa=0x%lx, "
2045 		    "len=%zu", __func__, gpa, len);
2046 		errno = EINVAL;
2047 		return (NULL);
2048 	}
2049 
2050 	return ((char *)vmr->vmr_va + off);
2051 }
2052 
2053 /*
2054  * vcpu_assert_pic_irq
2055  *
2056  * Injects the specified IRQ on the supplied vcpu/vm
2057  *
2058  * Parameters:
2059  *  vm_id: VM ID to inject to
2060  *  vcpu_id: VCPU ID to inject to
2061  *  irq: IRQ to inject
2062  */
2063 void
2064 vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
2065 {
2066 	int ret;
2067 
2068 	i8259_assert_irq(irq);
2069 
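	/*
	 * If the PIC now has an unmasked interrupt pending, raise the INTR
	 * line on the vcpu and wake it in case it is halted.
	 */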
2070 	if (i8259_is_pending()) {
2071 		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
2072 			fatalx("%s: can't assert INTR", __func__);
2073 
2074 		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
2075 		if (ret)
2076 			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);
2077 
2078 		vcpu_hlt[vcpu_id] = 0;
2079 		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
2080 		if (ret)
2081 			fatalx("%s: can't signal (%d)", __func__, ret);
2082 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
2083 		if (ret)
2084 			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
2085 	}
2086 }
2087 
2088 /*
2089  * vcpu_deassert_pic_irq
2090  *
2091  * Clears the specified IRQ on the supplied vcpu/vm
2092  *
2093  * Parameters:
2094  *  vm_id: VM ID to clear in
2095  *  vcpu_id: VCPU ID to clear in
2096  *  irq: IRQ to clear
2097  */
2098 void
2099 vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
2100 {
2101 	i8259_deassert_irq(irq);
2102 
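	/* Only drop the INTR line once no unmasked interrupts remain pending. */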
2103 	if (!i8259_is_pending()) {
2104 		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
2105 			fatalx("%s: can't deassert INTR for vm_id %d, "
2106 			    "vcpu_id %d", __func__, vm_id, vcpu_id);
2107 	}
2108 }
2109 
2110 /*
2111  * fd_hasdata
2112  *
2113  * Determines if data can be read from a file descriptor.
2114  *
2115  * Parameters:
2116  *  fd: the fd to check
2117  *
2118  * Return values:
2119  *  1 if data can be read from an fd, or 0 otherwise.
2120  */
2121 int
2122 fd_hasdata(int fd)
2123 {
2124 	struct pollfd pfd[1];
2125 	int nready, hasdata = 0;
2126 
2127 	pfd[0].fd = fd;
2128 	pfd[0].events = POLLIN;
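	/* Poll with a zero timeout so this check never blocks. */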
2129 	nready = poll(pfd, 1, 0);
2130 	if (nready == -1)
2131 		log_warn("checking file descriptor for data failed");
2132 	else if (nready == 1 && pfd[0].revents & POLLIN)
2133 		hasdata = 1;
2134 	return (hasdata);
2135 }
2136 
2137 /*
2138  * mutex_lock
2139  *
2140  * Wrapper function for pthread_mutex_lock that does error checking and that
2141  * exits on failure
2142  */
2143 void
2144 mutex_lock(pthread_mutex_t *m)
2145 {
2146 	int ret;
2147 
2148 	ret = pthread_mutex_lock(m);
2149 	if (ret) {
2150 		errno = ret;
2151 		fatal("could not acquire mutex");
2152 	}
2153 }
2154 
2155 /*
2156  * mutex_unlock
2157  *
2158  * Wrapper function for pthread_mutex_unlock that does error checking and that
2159  * exits on failure
2160  */
2161 void
2162 mutex_unlock(pthread_mutex_t *m)
2163 {
2164 	int ret;
2165 
2166 	ret = pthread_mutex_unlock(m);
2167 	if (ret) {
2168 		errno = ret;
2169 		fatal("could not release mutex");
2170 	}
2171 }
2172 
2173 /*
2174  * set_return_data
2175  *
2176  * Utility function for manipulating register data in vm exit info structs. This
2177  * function ensures that the data is copied to the vei->vei.vei_data field with
2178  * the proper size for the operation being performed.
2179  *
2180  * Parameters:
2181  *  vei: exit information
2182  *  data: return data
2183  */
2184 void
2185 set_return_data(struct vm_exit *vei, uint32_t data)
2186 {
2187 	switch (vei->vei.vei_size) {
2188 	case 1:
2189 		vei->vei.vei_data &= ~0xFF;
2190 		vei->vei.vei_data |= (uint8_t)data;
2191 		break;
2192 	case 2:
2193 		vei->vei.vei_data &= ~0xFFFF;
2194 		vei->vei.vei_data |= (uint16_t)data;
2195 		break;
2196 	case 4:
2197 		vei->vei.vei_data = data;
2198 		break;
2199 	}
2200 }
2201 
2202 /*
2203  * get_input_data
2204  *
2205  * Utility function for manipulating register data in vm exit info
2206  * structs. This function ensures that the data is copied from the
2207  * vei->vei.vei_data field with the proper size for the operation being
2208  * performed.
2209  *
2210  * Parameters:
2211  *  vei: exit information
2212  *  data: location to store the result
2213  */
2214 void
2215 get_input_data(struct vm_exit *vei, uint32_t *data)
2216 {
2217 	switch (vei->vei.vei_size) {
2218 	case 1:
2219 		*data &= 0xFFFFFF00;
2220 		*data |= (uint8_t)vei->vei.vei_data;
2221 		break;
2222 	case 2:
2223 		*data &= 0xFFFF0000;
2224 		*data |= (uint16_t)vei->vei.vei_data;
2225 		break;
2226 	case 4:
2227 		*data = vei->vei.vei_data;
2228 		break;
2229 	default:
2230 		log_warnx("%s: invalid i/o size %d", __func__,
2231 		    vei->vei.vei_size);
2232 	}
2234 }
2235 
2236 /*
2237  * translate_gva
2238  *
2239  * Translates a guest virtual address to a guest physical address by walking
2240  * the currently active page table (if needed).
2241  *
2242  * XXX ensure translate_gva updates the A bit in the PTE
2243  * XXX ensure translate_gva respects segment base and limits in i386 mode
2244  * XXX ensure translate_gva respects segment wraparound in i8086 mode
2245  * XXX ensure translate_gva updates the A bit in the segment selector
2246  * XXX ensure translate_gva respects CR4.LMSLE if available
2247  *
2248  * Parameters:
2249  *  exit: The VCPU this translation should be performed for (guest MMU settings
2250  *   are gathered from this VCPU)
2251  *  va: virtual address to translate
2252  *  pa: pointer to paddr_t variable that will receive the translated physical
2253  *   address. 'pa' is unchanged on error.
2254  *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
2255  *   the address should be translated
2256  *
2257  * Return values:
2258  *  0: the address was successfully translated - 'pa' contains the physical
2259  *     address currently mapped by 'va'.
2260  *  EFAULT: the PTE for 'va' is unmapped, or a paging structure could not be
2261  *     read from guest memory
2262  *  EPERM: the access is not permitted by the page protections
 *  EIO: the updated accessed/dirty bits could not be written back to the PTE
 *  EINVAL: 'pa' is NULL, or the vcpu is not in protected mode
2263  */
2264 int
2265 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
2266 {
2267 	int level, shift, pdidx;
2268 	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
2269 	uint64_t shift_width, pte_size;
2270 	struct vcpu_reg_state *vrs;
2271 
2272 	vrs = &exit->vrs;
2273 
2274 	if (!pa)
2275 		return (EINVAL);
2276 
2277 	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
2278 		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
2279 		*pa = va;
2280 		return (0);
2281 	}
2282 
2283 	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
2284 
2285 	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
2286 	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
2287 
2288 	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
2289 		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
2290 			pte_size = sizeof(uint64_t);
2291 			shift_width = 9;
2292 
2293 			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
2294 				/* 4 level paging */
2295 				level = 4;
2296 				mask = L4_MASK;
2297 				shift = L4_SHIFT;
2298 			} else {
2299 				/* 32 bit with PAE paging */
2300 				level = 3;
2301 				mask = L3_MASK;
2302 				shift = L3_SHIFT;
2303 			}
2304 		} else {
2305 			/* 32 bit paging */
2306 			level = 2;
2307 			shift_width = 10;
2308 			mask = 0xFFC00000;
2309 			shift = 22;
2310 			pte_size = sizeof(uint32_t);
2311 		}
2312 	} else
2313 		return (EINVAL);
2314 
2315 	/* XXX: Check for R bit in segment selector and set A bit */
2316 
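	/*
	 * Walk the paging structures from the top level down, re-indexing
	 * with the current mask/shift at each level, until a leaf (or large
	 * page) entry is reached.
	 */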
2317 	for (; level > 0; level--) {
2318 		pdidx = (va & mask) >> shift;
2319 		pte_paddr = (pt_paddr) + (pdidx * pte_size);
2320 
2321 		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
2322 		    level, pte_paddr);
2323 		if (read_mem(pte_paddr, &pte, pte_size)) {
2324 			log_warn("%s: failed to read pte", __func__);
2325 			return (EFAULT);
2326 		}
2327 
2328 		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
2329 		    pte);
2330 
2331 		/* XXX: Set CR2  */
2332 		if (!(pte & PG_V))
2333 			return (EFAULT);
2334 
2335 		/* XXX: Check for SMAP */
2336 		if ((mode == PROT_WRITE) && !(pte & PG_RW))
2337 			return (EPERM);
2338 
2339 		if ((exit->cpl > 0) && !(pte & PG_u))
2340 			return (EPERM);
2341 
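		/* Mark the PTE accessed (PG_U) and, on writes, dirty (PG_M). */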
2342 		pte = pte | PG_U;
2343 		if (mode == PROT_WRITE)
2344 			pte = pte | PG_M;
2345 		if (write_mem(pte_paddr, &pte, pte_size)) {
2346 			log_warn("%s: failed to write back flags to pte",
2347 			    __func__);
2348 			return (EIO);
2349 		}
2350 
2351 		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
2352 		if (pte & PG_PS)
2353 			break;
2354 
2355 		if (level > 1) {
2356 			pt_paddr = pte & PG_FRAME;
2357 			shift -= shift_width;
2358 			mask = mask >> shift_width;
2359 		}
2360 	}
2361 
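	/*
	 * Combine the frame address from the final PTE with the low-order
	 * bits of the virtual address to form the guest physical address.
	 */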
2362 	low_mask = ((uint64_t)1 << shift) - 1;
2363 	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
2364 	*pa = (pte & high_mask) | (va & low_mask);
2365 
2366 	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx", __func__, va, *pa);
2367 
2368 	return (0);
2369 }
2370 
2371 /*
2372  * vm_pipe_init
2373  *
2374  * Initialize a vm_dev_pipe, setting up its file descriptors and its
2375  * event structure with the given callback.
2376  *
2377  * Parameters:
2378  *  p: pointer to vm_dev_pipe struct to initialize
2379  *  cb: callback to use for READ events on the read end of the pipe
2380  */
2381 void
2382 vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
2383 {
2384 	int ret;
2385 	int fds[2];
2386 
2387 	memset(p, 0, sizeof(struct vm_dev_pipe));
2388 
2389 	ret = pipe(fds);
2390 	if (ret)
2391 		fatal("failed to create vm_dev_pipe pipe");
2392 
2393 	p->read = fds[0];
2394 	p->write = fds[1];
2395 
2396 	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
2397 }
2398 
2399 /*
2400  * vm_pipe_send
2401  *
2402  * Send a message to an emulated device via the provided vm_dev_pipe.
2403  *
2404  * Parameters:
2405  *  p: pointer to initialized vm_dev_pipe
2406  *  msg: message to send in the channel
2407  */
2408 void
2409 vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
2410 {
2411 	ssize_t n;
2412 	n = write(p->write, &msg, sizeof(msg));
2413 	if (n == -1 || (size_t)n != sizeof(msg))
2414 		fatal("failed to write to device pipe");
2415 }
2416 
2417 /*
2418  * vm_pipe_recv
2419  *
2420  * Receive a message for an emulated device via the provided vm_dev_pipe.
2421  * Returns the message value, otherwise will exit on failure.
2422  *
2423  * Parameters:
2424  *  p: pointer to initialized vm_dev_pipe
2425  *
2426  * Return values:
2427  *  a value of enum pipe_msg_type or fatal exit on read(2) error
2428  */
2429 enum pipe_msg_type
2430 vm_pipe_recv(struct vm_dev_pipe *p)
2431 {
2432 	ssize_t n;
2433 	enum pipe_msg_type msg;
2434 	n = read(p->read, &msg, sizeof(msg));
2435 	if (n == -1 || (size_t)n != sizeof(msg))
2436 		fatal("failed to read from device pipe");
2437 
2438 	return msg;
2439 }
2440