1 /* $OpenBSD: vm.c,v 1.88 2023/04/28 19:46:42 dv Exp $ */ 2 3 /* 4 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/param.h> /* PAGE_SIZE, MAXCOMLEN */ 20 #include <sys/types.h> 21 #include <sys/ioctl.h> 22 #include <sys/queue.h> 23 #include <sys/wait.h> 24 #include <sys/uio.h> 25 #include <sys/stat.h> 26 #include <sys/socket.h> 27 #include <sys/time.h> 28 #include <sys/mman.h> 29 #include <sys/resource.h> 30 31 #include <dev/ic/i8253reg.h> 32 #include <dev/isa/isareg.h> 33 #include <dev/pci/pcireg.h> 34 35 #include <machine/psl.h> 36 #include <machine/pte.h> 37 #include <machine/specialreg.h> 38 #include <machine/vmmvar.h> 39 40 #include <net/if.h> 41 42 #include <errno.h> 43 #include <event.h> 44 #include <fcntl.h> 45 #include <imsg.h> 46 #include <limits.h> 47 #include <poll.h> 48 #include <pthread.h> 49 #include <pthread_np.h> 50 #include <stddef.h> 51 #include <stdio.h> 52 #include <stdlib.h> 53 #include <string.h> 54 #include <unistd.h> 55 #include <util.h> 56 57 #include "atomicio.h" 58 #include "fw_cfg.h" 59 #include "i8253.h" 60 #include "i8259.h" 61 #include "loadfile.h" 62 #include "mc146818.h" 63 #include "mmio.h" 64 #include "ns8250.h" 65 #include "pci.h" 66 #include "virtio.h" 67 #include "vmd.h" 68 #include "vmm.h" 69 70 #define MB(x) (x * 1024UL * 1024UL) 71 #define GB(x) (x * 1024UL * 1024UL * 1024UL) 72 73 #define MMIO_NOTYET 0 74 75 io_fn_t ioports_map[MAX_PORTS]; 76 77 static int run_vm(struct vmop_create_params *, struct vcpu_reg_state *); 78 void vm_dispatch_vmm(int, short, void *); 79 void *event_thread(void *); 80 void *vcpu_run_loop(void *); 81 int vcpu_exit(struct vm_run_params *); 82 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *); 83 void create_memory_map(struct vm_create_params *); 84 static int vmm_create_vm(struct vmd_vm *); 85 int alloc_guest_mem(struct vmd_vm *); 86 void init_emulated_hw(struct vmop_create_params *, int, 87 int[][VM_MAX_BASE_PER_DISK], int *); 88 void restore_emulated_hw(struct vm_create_params *, int, int *, 89 int[][VM_MAX_BASE_PER_DISK],int); 90 void vcpu_exit_inout(struct vm_run_params *); 91 int vcpu_exit_eptviolation(struct vm_run_params *); 92 uint8_t vcpu_exit_pci(struct vm_run_params *); 93 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t); 94 int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *); 95 static int send_vm(int, struct vmd_vm *); 96 int dump_send_header(int); 97 static int dump_vmr(int , struct vm_mem_range *); 98 static int dump_mem(int, struct vmd_vm *); 99 void restore_vmr(int, struct vm_mem_range *); 100 void restore_mem(int, struct vm_create_params *); 101 int restore_vm_params(int, struct vm_create_params *); 102 static void pause_vm(struct vmd_vm *); 103 static void unpause_vm(struct vmd_vm *); 104 105 int translate_gva(struct 
vm_exit*, uint64_t, uint64_t *, int); 106 107 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t, 108 size_t); 109 110 int con_fd; 111 struct vmd_vm *current_vm; 112 113 extern struct vmd *env; 114 115 extern char *__progname; 116 117 pthread_mutex_t threadmutex; 118 pthread_cond_t threadcond; 119 120 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM]; 121 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM]; 122 pthread_barrier_t vm_pause_barrier; 123 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM]; 124 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM]; 125 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM]; 126 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM]; 127 128 /* 129 * Represents a standard register set for an OS to be booted 130 * as a flat 64 bit address space. 131 * 132 * NOT set here are: 133 * RIP 134 * RSP 135 * GDTR BASE 136 * 137 * Specific bootloaders should clone this structure and override 138 * those fields as needed. 139 * 140 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on 141 * features of the CPU in use. 142 */ 143 static const struct vcpu_reg_state vcpu_init_flat64 = { 144 .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, 145 .vrs_gprs[VCPU_REGS_RIP] = 0x0, 146 .vrs_gprs[VCPU_REGS_RSP] = 0x0, 147 .vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG, 148 .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE, 149 .vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE, 150 .vrs_crs[VCPU_REGS_PDPTE0] = 0ULL, 151 .vrs_crs[VCPU_REGS_PDPTE1] = 0ULL, 152 .vrs_crs[VCPU_REGS_PDPTE2] = 0ULL, 153 .vrs_crs[VCPU_REGS_PDPTE3] = 0ULL, 154 .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0}, 155 .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 156 .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 157 .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 158 .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 159 .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 160 .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, 161 .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, 162 .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, 163 .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, 164 .vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA, 165 .vrs_drs[VCPU_REGS_DR0] = 0x0, 166 .vrs_drs[VCPU_REGS_DR1] = 0x0, 167 .vrs_drs[VCPU_REGS_DR2] = 0x0, 168 .vrs_drs[VCPU_REGS_DR3] = 0x0, 169 .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0, 170 .vrs_drs[VCPU_REGS_DR7] = 0x400, 171 .vrs_msrs[VCPU_REGS_STAR] = 0ULL, 172 .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL, 173 .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL, 174 .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL, 175 .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL, 176 .vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL, 177 .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87 178 }; 179 180 /* 181 * Represents a standard register set for a BIOS to be booted 182 * as a flat 16 bit address space.
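 *
 * As with vcpu_init_flat64 above, a loader clones one of these templates
 * and then overrides only the state it needs. A minimal sketch of that
 * pattern (illustrative only; "entry" and "stack" are hypothetical values,
 * not variables in this file):
 *
 *	struct vcpu_reg_state vrs;
 *
 *	memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
 *	vrs.vrs_gprs[VCPU_REGS_RIP] = entry;
 *	vrs.vrs_gprs[VCPU_REGS_RSP] = stack;
 *
 * loadfile_bios() below copies vcpu_init_flat16 unchanged, since the BIOS
 * reset vector in CS:RIP is already part of the template.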
183 */ 184 static const struct vcpu_reg_state vcpu_init_flat16 = { 185 .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, 186 .vrs_gprs[VCPU_REGS_RIP] = 0xFFF0, 187 .vrs_gprs[VCPU_REGS_RSP] = 0x0, 188 .vrs_crs[VCPU_REGS_CR0] = 0x60000010, 189 .vrs_crs[VCPU_REGS_CR3] = 0, 190 .vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000}, 191 .vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 192 .vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0}, 193 .vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 194 .vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 195 .vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 196 .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, 197 .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, 198 .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, 199 .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, 200 .vrs_msrs[VCPU_REGS_EFER] = 0ULL, 201 .vrs_drs[VCPU_REGS_DR0] = 0x0, 202 .vrs_drs[VCPU_REGS_DR1] = 0x0, 203 .vrs_drs[VCPU_REGS_DR2] = 0x0, 204 .vrs_drs[VCPU_REGS_DR3] = 0x0, 205 .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0, 206 .vrs_drs[VCPU_REGS_DR7] = 0x400, 207 .vrs_msrs[VCPU_REGS_STAR] = 0ULL, 208 .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL, 209 .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL, 210 .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL, 211 .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL, 212 .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87 213 }; 214 215 /* 216 * vm_main 217 * 218 * Primary entrypoint for launching a vm. Does not return. 219 * 220 * fd: file descriptor for communicating with vmm process. 221 */ 222 void 223 vm_main(int fd) 224 { 225 struct vm_create_params *vcp = NULL; 226 struct vmd_vm vm; 227 size_t sz = 0; 228 int ret = 0; 229 230 /* 231 * We aren't root, so we can't chroot(2). Use unveil(2) instead. 232 */ 233 if (unveil(env->argv0, "x") == -1) 234 fatal("unveil %s", env->argv0); 235 if (unveil(NULL, NULL) == -1) 236 fatal("unveil lock"); 237 238 /* 239 * pledge in the vm processes: 240 * stdio - for malloc and basic I/O including events. 241 * vmm - for the vmm ioctls and operations. 242 * proc exec - fork/exec for launching devices. 243 * recvfd - for vm send/recv and sending fd to devices. 244 * tmppath/rpath - for shm_mkstemp, ftruncate, unlink 245 */ 246 if (pledge("stdio vmm proc exec recvfd tmppath rpath", NULL) == -1) 247 fatal("pledge"); 248 249 /* Receive our vm configuration. */ 250 memset(&vm, 0, sizeof(vm)); 251 sz = atomicio(read, fd, &vm, sizeof(vm)); 252 if (sz != sizeof(vm)) { 253 log_warnx("failed to receive start message"); 254 _exit(EIO); 255 } 256 257 /* Receive the /dev/vmm fd number. */ 258 sz = atomicio(read, fd, &env->vmd_fd, sizeof(env->vmd_fd)); 259 if (sz != sizeof(env->vmd_fd)) { 260 log_warnx("failed to receive /dev/vmm fd"); 261 _exit(EIO); 262 } 263 264 /* Update process with the vm name. */ 265 vcp = &vm.vm_params.vmc_params; 266 setproctitle("%s", vcp->vcp_name); 267 log_procinit(vcp->vcp_name); 268 269 /* 270 * We need, at minimum, a vm_kernel fd to boot a vm. This is either a 271 * kernel or a BIOS image. 272 */ 273 if (!(vm.vm_state & VM_STATE_RECEIVED)) { 274 if (vm.vm_kernel == -1) { 275 log_warnx("%s: failed to receive boot fd", 276 vcp->vcp_name); 277 _exit(EINVAL); 278 } 279 if (fcntl(vm.vm_kernel, F_SETFL, O_NONBLOCK) == -1) { 280 ret = errno; 281 log_warn("failed to set nonblocking mode on boot fd"); 282 _exit(ret); 283 } 284 } 285 286 ret = start_vm(&vm, fd); 287 _exit(ret); 288 } 289 290 /* 291 * loadfile_bios 292 * 293 * Alternatively to loadfile_elf, this function loads a non-ELF BIOS image 294 * directly into memory. 
295 * 296 * Parameters: 297 * fp: file handle of the BIOS image to load 298 * size: uncompressed size of the image 299 * (out) vrs: register state to set on init for this kernel 300 * 301 * Return values: 302 * 0 if successful 303 * various error codes returned from read(2) or loadelf functions 304 */ 305 int 306 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs) 307 { 308 off_t off; 309 310 /* Set up a "flat 16 bit" register state for BIOS */ 311 memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs)); 312 313 /* Seek to the beginning of the BIOS image */ 314 if (gzseek(fp, 0, SEEK_SET) == -1) 315 return (-1); 316 317 /* The BIOS image must end at 1MB */ 318 if ((off = MB(1) - size) < 0) 319 return (-1); 320 321 /* Read BIOS image into memory */ 322 if (mread(fp, off, size) != (size_t)size) { 323 errno = EIO; 324 return (-1); 325 } 326 327 if (gzseek(fp, 0, SEEK_SET) == -1) 328 return (-1); 329 330 /* Read a second BIOS copy into memory ending at 4GB */ 331 off = GB(4) - size; 332 if (mread(fp, off, size) != (size_t)size) { 333 errno = EIO; 334 return (-1); 335 } 336 337 log_debug("%s: loaded BIOS image", __func__); 338 339 return (0); 340 } 341 342 /* 343 * start_vm 344 * 345 * After forking a new VM process, starts the new VM with the creation 346 * parameters supplied (in the incoming vm->vm_params field). This 347 * function performs a basic sanity check on the incoming parameters 348 * and then performs the following steps to complete the creation of the VM: 349 * 350 * 1. validates and creates the new VM 351 * 2. opens the imsg control channel to the parent and drops more privilege 352 * 3. drops additional privileges by calling pledge(2) 353 * 4. loads the kernel from the disk image or file descriptor 354 * 5. runs the VM's VCPU loops. 355 * 356 * Parameters: 357 * vm: The VM data structure containing the VM create parameters. 358 * fd: The imsg socket that is connected to the parent process. 359 * 360 * Return values: 361 * 0: success 362 * !0 : failure - typically an errno indicating the source of the failure 363 */ 364 int 365 start_vm(struct vmd_vm *vm, int fd) 366 { 367 struct vmop_create_params *vmc = &vm->vm_params; 368 struct vm_create_params *vcp = &vmc->vmc_params; 369 struct vcpu_reg_state vrs; 370 int nicfds[VM_MAX_NICS_PER_VM]; 371 int ret; 372 gzFile fp; 373 size_t i; 374 struct vm_rwregs_params vrp; 375 struct stat sb; 376 377 /* 378 * We first try to initialize and allocate memory before bothering 379 * vmm(4) with a request to create a new vm. 380 */ 381 if (!(vm->vm_state & VM_STATE_RECEIVED)) 382 create_memory_map(vcp); 383 384 ret = alloc_guest_mem(vm); 385 if (ret) { 386 struct rlimit lim; 387 char buf[FMT_SCALED_STRSIZE]; 388 if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) { 389 if (fmt_scaled(lim.rlim_cur, buf) == 0) 390 fatalx("could not allocate guest memory (data " 391 "limit is %s)", buf); 392 } 393 errno = ret; 394 log_warn("could not allocate guest memory"); 395 return (ret); 396 } 397 398 /* We've allocated guest memory, so now create the vm in vmm(4). */ 399 ret = vmm_create_vm(vm); 400 if (ret) { 401 /* Let the vmm process know we failed by sending a 0 vm id. */ 402 vcp->vcp_id = 0; 403 atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)); 404 return (ret); 405 } 406 407 /* 408 * Some of vmd currently relies on global state (current_vm, con_fd).
409 */ 410 current_vm = vm; 411 con_fd = vm->vm_tty; 412 if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) { 413 log_warn("failed to set nonblocking mode on console"); 414 return (1); 415 } 416 417 /* 418 * We now let the vmm process know we were successful by sending it our 419 * vmm(4) assigned vm id. 420 */ 421 if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) != 422 sizeof(vcp->vcp_id)) { 423 log_warn("failed to send created vm id to vmm process"); 424 return (1); 425 } 426 427 /* Prepare either our boot image or receive an existing vm to launch. */ 428 if (vm->vm_state & VM_STATE_RECEIVED) { 429 ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp)); 430 if (ret != sizeof(vrp)) 431 fatal("received incomplete vrp - exiting"); 432 vrs = vrp.vrwp_regs; 433 } else { 434 /* 435 * Set up default "flat 64 bit" register state - RIP, 436 * RSP, and GDT info will be set in bootloader 437 */ 438 memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs)); 439 440 /* Find and open kernel image */ 441 if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL) 442 fatalx("failed to open kernel - exiting"); 443 444 /* Load kernel image */ 445 ret = loadfile_elf(fp, vm, &vrs, vmc->vmc_bootdevice); 446 447 /* 448 * Try BIOS as a fallback (only if it was provided as an image 449 * with vm->vm_kernel and the file is not compressed) 450 */ 451 if (ret && errno == ENOEXEC && vm->vm_kernel != -1 && 452 gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0) 453 ret = loadfile_bios(fp, sb.st_size, &vrs); 454 455 if (ret) 456 fatal("failed to load kernel or BIOS - exiting"); 457 458 gzclose(fp); 459 } 460 461 if (vm->vm_kernel != -1) 462 close_fd(vm->vm_kernel); 463 464 /* Initialize our mutexes. */ 465 ret = pthread_mutex_init(&threadmutex, NULL); 466 if (ret) { 467 log_warn("%s: could not initialize thread state mutex", 468 __func__); 469 return (ret); 470 } 471 ret = pthread_cond_init(&threadcond, NULL); 472 if (ret) { 473 log_warn("%s: could not initialize thread state " 474 "condition variable", __func__); 475 return (ret); 476 } 477 mutex_lock(&threadmutex); 478 479 480 /* 481 * Finalize our communication socket with the vmm process. From here 482 * onwards, communication with the vmm process is event-based. 483 */ 484 event_init(); 485 if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1) 486 fatal("setup vm pipe"); 487 488 /* 489 * Initialize or restore our emulated hardware. 490 */ 491 for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) 492 nicfds[i] = vm->vm_ifs[i].vif_fd; 493 494 if (vm->vm_state & VM_STATE_RECEIVED) { 495 restore_mem(vm->vm_receive_fd, vcp); 496 restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds, 497 vm->vm_disks, vm->vm_cdrom); 498 if (restore_vm_params(vm->vm_receive_fd, vcp)) 499 fatal("restore vm params failed"); 500 unpause_vm(vm); 501 } else 502 init_emulated_hw(vmc, vm->vm_cdrom, vm->vm_disks, nicfds); 503 504 /* Drop privileges further before starting the vcpu run loop(s). */ 505 if (pledge("stdio vmm recvfd", NULL) == -1) 506 fatal("pledge"); 507 508 /* 509 * Execute the vcpu run loop(s) for this VM. 510 */ 511 ret = run_vm(&vm->vm_params, &vrs); 512 513 /* Ensure that any in-flight data is written back */ 514 virtio_shutdown(vm); 515 516 return (ret); 517 } 518 519 /* 520 * vm_dispatch_vmm 521 * 522 * imsg callback for messages that are received from the vmm parent process.
523 */ 524 void 525 vm_dispatch_vmm(int fd, short event, void *arg) 526 { 527 struct vmd_vm *vm = arg; 528 struct vmop_result vmr; 529 struct vmop_addr_result var; 530 struct imsgev *iev = &vm->vm_iev; 531 struct imsgbuf *ibuf = &iev->ibuf; 532 struct imsg imsg; 533 ssize_t n; 534 int verbose; 535 536 if (event & EV_READ) { 537 if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN) 538 fatal("%s: imsg_read", __func__); 539 if (n == 0) 540 _exit(0); 541 } 542 543 if (event & EV_WRITE) { 544 if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN) 545 fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd); 546 if (n == 0) 547 _exit(0); 548 } 549 550 for (;;) { 551 if ((n = imsg_get(ibuf, &imsg)) == -1) 552 fatal("%s: imsg_get", __func__); 553 if (n == 0) 554 break; 555 556 #if DEBUG > 1 557 log_debug("%s: got imsg %d from %s", 558 __func__, imsg.hdr.type, 559 vm->vm_params.vmc_params.vcp_name); 560 #endif 561 562 switch (imsg.hdr.type) { 563 case IMSG_CTL_VERBOSE: 564 IMSG_SIZE_CHECK(&imsg, &verbose); 565 memcpy(&verbose, imsg.data, sizeof(verbose)); 566 log_setverbose(verbose); 567 break; 568 case IMSG_VMDOP_VM_SHUTDOWN: 569 if (vmmci_ctl(VMMCI_SHUTDOWN) == -1) 570 _exit(0); 571 break; 572 case IMSG_VMDOP_VM_REBOOT: 573 if (vmmci_ctl(VMMCI_REBOOT) == -1) 574 _exit(0); 575 break; 576 case IMSG_VMDOP_PAUSE_VM: 577 vmr.vmr_result = 0; 578 vmr.vmr_id = vm->vm_vmid; 579 pause_vm(vm); 580 imsg_compose_event(&vm->vm_iev, 581 IMSG_VMDOP_PAUSE_VM_RESPONSE, 582 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr, 583 sizeof(vmr)); 584 break; 585 case IMSG_VMDOP_UNPAUSE_VM: 586 vmr.vmr_result = 0; 587 vmr.vmr_id = vm->vm_vmid; 588 unpause_vm(vm); 589 imsg_compose_event(&vm->vm_iev, 590 IMSG_VMDOP_UNPAUSE_VM_RESPONSE, 591 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr, 592 sizeof(vmr)); 593 break; 594 case IMSG_VMDOP_SEND_VM_REQUEST: 595 vmr.vmr_id = vm->vm_vmid; 596 vmr.vmr_result = send_vm(imsg.fd, vm); 597 imsg_compose_event(&vm->vm_iev, 598 IMSG_VMDOP_SEND_VM_RESPONSE, 599 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr, 600 sizeof(vmr)); 601 if (!vmr.vmr_result) { 602 imsg_flush(&current_vm->vm_iev.ibuf); 603 _exit(0); 604 } 605 break; 606 case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE: 607 IMSG_SIZE_CHECK(&imsg, &var); 608 memcpy(&var, imsg.data, sizeof(var)); 609 610 log_debug("%s: received tap addr %s for nic %d", 611 vm->vm_params.vmc_params.vcp_name, 612 ether_ntoa((void *)var.var_addr), var.var_nic_idx); 613 614 vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr); 615 break; 616 default: 617 fatalx("%s: got invalid imsg %d from %s", 618 __func__, imsg.hdr.type, 619 vm->vm_params.vmc_params.vcp_name); 620 } 621 imsg_free(&imsg); 622 } 623 imsg_event_add(iev); 624 } 625 626 /* 627 * vm_shutdown 628 * 629 * Tell the vmm parent process to shutdown or reboot the VM and exit.
630 */ 631 __dead void 632 vm_shutdown(unsigned int cmd) 633 { 634 switch (cmd) { 635 case VMMCI_NONE: 636 case VMMCI_SHUTDOWN: 637 (void)imsg_compose_event(&current_vm->vm_iev, 638 IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0); 639 break; 640 case VMMCI_REBOOT: 641 (void)imsg_compose_event(&current_vm->vm_iev, 642 IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0); 643 break; 644 default: 645 fatalx("invalid vm ctl command: %d", cmd); 646 } 647 imsg_flush(&current_vm->vm_iev.ibuf); 648 649 _exit(0); 650 } 651 652 int 653 send_vm(int fd, struct vmd_vm *vm) 654 { 655 struct vm_rwregs_params vrp; 656 struct vm_rwvmparams_params vpp; 657 struct vmop_create_params *vmc; 658 struct vm_terminate_params vtp; 659 unsigned int flags = 0; 660 unsigned int i; 661 int ret = 0; 662 size_t sz; 663 664 if (dump_send_header(fd)) { 665 log_warnx("%s: failed to send vm dump header", __func__); 666 goto err; 667 } 668 669 pause_vm(vm); 670 671 vmc = calloc(1, sizeof(struct vmop_create_params)); 672 if (vmc == NULL) { 673 log_warn("%s: calloc error getting vmc", __func__); 674 ret = -1; 675 goto err; 676 } 677 678 flags |= VMOP_CREATE_MEMORY; 679 memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct 680 vmop_create_params)); 681 vmc->vmc_flags = flags; 682 vrp.vrwp_vm_id = vm->vm_params.vmc_params.vcp_id; 683 vrp.vrwp_mask = VM_RWREGS_ALL; 684 vpp.vpp_mask = VM_RWVMPARAMS_ALL; 685 vpp.vpp_vm_id = vm->vm_params.vmc_params.vcp_id; 686 687 sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params)); 688 if (sz != sizeof(struct vmop_create_params)) { 689 ret = -1; 690 goto err; 691 } 692 693 for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) { 694 vrp.vrwp_vcpu_id = i; 695 if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) { 696 log_warn("%s: readregs failed", __func__); 697 goto err; 698 } 699 700 sz = atomicio(vwrite, fd, &vrp, 701 sizeof(struct vm_rwregs_params)); 702 if (sz != sizeof(struct vm_rwregs_params)) { 703 log_warn("%s: dumping registers failed", __func__); 704 ret = -1; 705 goto err; 706 } 707 } 708 709 /* Dump memory before devices to aid in restoration.
*/ 710 if ((ret = dump_mem(fd, vm))) 711 goto err; 712 if ((ret = i8253_dump(fd))) 713 goto err; 714 if ((ret = i8259_dump(fd))) 715 goto err; 716 if ((ret = ns8250_dump(fd))) 717 goto err; 718 if ((ret = mc146818_dump(fd))) 719 goto err; 720 if ((ret = fw_cfg_dump(fd))) 721 goto err; 722 if ((ret = pci_dump(fd))) 723 goto err; 724 if ((ret = virtio_dump(fd))) 725 goto err; 726 727 for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) { 728 vpp.vpp_vcpu_id = i; 729 if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) { 730 log_warn("%s: readvmparams failed", __func__); 731 goto err; 732 } 733 734 sz = atomicio(vwrite, fd, &vpp, 735 sizeof(struct vm_rwvmparams_params)); 736 if (sz != sizeof(struct vm_rwvmparams_params)) { 737 log_warn("%s: dumping vm params failed", __func__); 738 ret = -1; 739 goto err; 740 } 741 } 742 743 vtp.vtp_vm_id = vm->vm_params.vmc_params.vcp_id; 744 if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) { 745 log_warnx("%s: term IOC error: %d, %d", __func__, 746 errno, ENOENT); 747 } 748 err: 749 close(fd); 750 if (ret) 751 unpause_vm(vm); 752 return ret; 753 } 754 755 int 756 dump_send_header(int fd) { 757 struct vm_dump_header vmh; 758 int i; 759 760 memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE, 761 sizeof(vmh.vmh_signature)); 762 763 vmh.vmh_cpuids[0].code = 0x00; 764 vmh.vmh_cpuids[0].leaf = 0x00; 765 766 vmh.vmh_cpuids[1].code = 0x01; 767 vmh.vmh_cpuids[1].leaf = 0x00; 768 769 vmh.vmh_cpuids[2].code = 0x07; 770 vmh.vmh_cpuids[2].leaf = 0x00; 771 772 vmh.vmh_cpuids[3].code = 0x0d; 773 vmh.vmh_cpuids[3].leaf = 0x00; 774 775 vmh.vmh_cpuids[4].code = 0x80000001; 776 vmh.vmh_cpuids[4].leaf = 0x00; 777 778 vmh.vmh_version = VM_DUMP_VERSION; 779 780 for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) { 781 CPUID_LEAF(vmh.vmh_cpuids[i].code, 782 vmh.vmh_cpuids[i].leaf, 783 vmh.vmh_cpuids[i].a, 784 vmh.vmh_cpuids[i].b, 785 vmh.vmh_cpuids[i].c, 786 vmh.vmh_cpuids[i].d); 787 } 788 789 if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh)) 790 return (-1); 791 792 return (0); 793 } 794 795 int 796 dump_mem(int fd, struct vmd_vm *vm) 797 { 798 unsigned int i; 799 int ret; 800 struct vm_mem_range *vmr; 801 802 for (i = 0; i < vm->vm_params.vmc_params.vcp_nmemranges; i++) { 803 vmr = &vm->vm_params.vmc_params.vcp_memranges[i]; 804 ret = dump_vmr(fd, vmr); 805 if (ret) 806 return ret; 807 } 808 return (0); 809 } 810 811 int 812 restore_vm_params(int fd, struct vm_create_params *vcp) { 813 unsigned int i; 814 struct vm_rwvmparams_params vpp; 815 816 for (i = 0; i < vcp->vcp_ncpus; i++) { 817 if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) { 818 log_warn("%s: error restoring vm params", __func__); 819 return (-1); 820 } 821 vpp.vpp_vm_id = vcp->vcp_id; 822 vpp.vpp_vcpu_id = i; 823 if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) { 824 log_debug("%s: writing vm params failed", __func__); 825 return (-1); 826 } 827 } 828 return (0); 829 } 830 831 void 832 restore_mem(int fd, struct vm_create_params *vcp) 833 { 834 unsigned int i; 835 struct vm_mem_range *vmr; 836 837 for (i = 0; i < vcp->vcp_nmemranges; i++) { 838 vmr = &vcp->vcp_memranges[i]; 839 restore_vmr(fd, vmr); 840 } 841 } 842 843 int 844 dump_vmr(int fd, struct vm_mem_range *vmr) 845 { 846 size_t rem = vmr->vmr_size, read=0; 847 char buf[PAGE_SIZE]; 848 849 while (rem > 0) { 850 if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) { 851 log_warn("failed to read vmr"); 852 return (-1); 853 } 854 if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) { 855 log_warn("failed to dump vmr"); 856 
return (-1); 857 } 858 rem = rem - PAGE_SIZE; 859 read = read + PAGE_SIZE; 860 } 861 return (0); 862 } 863 864 void 865 restore_vmr(int fd, struct vm_mem_range *vmr) 866 { 867 size_t rem = vmr->vmr_size, wrote=0; 868 char buf[PAGE_SIZE]; 869 870 while (rem > 0) { 871 if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf)) 872 fatal("failed to restore vmr"); 873 if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE)) 874 fatal("failed to write vmr"); 875 rem = rem - PAGE_SIZE; 876 wrote = wrote + PAGE_SIZE; 877 } 878 } 879 880 static void 881 pause_vm(struct vmd_vm *vm) 882 { 883 unsigned int n; 884 int ret; 885 if (vm->vm_state & VM_STATE_PAUSED) 886 return; 887 888 current_vm->vm_state |= VM_STATE_PAUSED; 889 890 ret = pthread_barrier_init(&vm_pause_barrier, NULL, 891 vm->vm_params.vmc_params.vcp_ncpus + 1); 892 if (ret) { 893 log_warnx("%s: cannot initialize pause barrier (%d)", 894 __progname, ret); 895 return; 896 } 897 898 for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) { 899 ret = pthread_cond_broadcast(&vcpu_run_cond[n]); 900 if (ret) { 901 log_warnx("%s: can't broadcast vcpu run cond (%d)", 902 __func__, (int)ret); 903 return; 904 } 905 } 906 ret = pthread_barrier_wait(&vm_pause_barrier); 907 if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) { 908 log_warnx("%s: could not wait on pause barrier (%d)", 909 __func__, (int)ret); 910 return; 911 } 912 913 ret = pthread_barrier_destroy(&vm_pause_barrier); 914 if (ret) { 915 log_warnx("%s: could not destroy pause barrier (%d)", 916 __progname, ret); 917 return; 918 } 919 920 i8253_stop(); 921 mc146818_stop(); 922 ns8250_stop(); 923 virtio_stop(vm); 924 } 925 926 static void 927 unpause_vm(struct vmd_vm *vm) 928 { 929 unsigned int n; 930 int ret; 931 if (!(vm->vm_state & VM_STATE_PAUSED)) 932 return; 933 934 current_vm->vm_state &= ~VM_STATE_PAUSED; 935 for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) { 936 ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]); 937 if (ret) { 938 log_warnx("%s: can't broadcast vcpu unpause cond (%d)", 939 __func__, (int)ret); 940 return; 941 } 942 } 943 944 i8253_start(); 945 mc146818_start(); 946 ns8250_start(); 947 virtio_start(vm); 948 } 949 950 /* 951 * vcpu_reset 952 * 953 * Requests vmm(4) to reset the VCPUs in the indicated VM to 954 * the register state provided 955 * 956 * Parameters 957 * vmid: VM ID to reset 958 * vcpu_id: VCPU ID to reset 959 * vrs: the register state to initialize 960 * 961 * Return values: 962 * 0: success 963 * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not 964 * valid) 965 */ 966 int 967 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs) 968 { 969 struct vm_resetcpu_params vrp; 970 971 memset(&vrp, 0, sizeof(vrp)); 972 vrp.vrp_vm_id = vmid; 973 vrp.vrp_vcpu_id = vcpu_id; 974 memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state)); 975 976 log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid); 977 978 if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1) 979 return (errno); 980 981 return (0); 982 } 983 984 /* 985 * create_memory_map 986 * 987 * Sets up the guest physical memory ranges that the VM can access. 
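 *
 * As a worked illustration (not part of the original comment), a guest
 * that asks for 8 GB of memory ends up with a layout along these lines,
 * assuming VMM_PCI_MMIO_BAR_BASE sits just below 4 GB:
 *
 *	[0, LOWMEM_KB * 1024)			RAM  (DOS low memory)
 *	[LOWMEM_KB * 1024, 1 MB)		reserved (ROM / VGA hole)
 *	[1 MB, VMM_PCI_MMIO_BAR_BASE)		RAM
 *	[VMM_PCI_MMIO_BAR_BASE,
 *	 VMM_PCI_MMIO_BAR_END]			MMIO (PCI BARs)
 *	[VMM_PCI_MMIO_BAR_END + 1, 4 GB)	reserved (2 MB BIOS copy)
 *	[4 GB, 4 GB + remainder)		RAM
 *
 * where "remainder" is whatever part of the requested 8 GB did not fit
 * between 1 MB and VMM_PCI_MMIO_BAR_BASE.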
988 * 989 * Parameters: 990 * vcp: VM create parameters describing the VM whose memory map 991 * is being created 992 * 993 * Return values: 994 * nothing 995 */ 996 void 997 create_memory_map(struct vm_create_params *vcp) 998 { 999 size_t len, mem_bytes; 1000 size_t above_1m = 0, above_4g = 0; 1001 1002 mem_bytes = vcp->vcp_memranges[0].vmr_size; 1003 vcp->vcp_nmemranges = 0; 1004 if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE) 1005 return; 1006 1007 /* First memory region: 0 - LOWMEM_KB (DOS low mem) */ 1008 len = LOWMEM_KB * 1024; 1009 vcp->vcp_memranges[0].vmr_gpa = 0x0; 1010 vcp->vcp_memranges[0].vmr_size = len; 1011 vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM; 1012 mem_bytes -= len; 1013 1014 /* 1015 * Second memory region: LOWMEM_KB - 1MB. 1016 * 1017 * N.B. - Normally ROMs or parts of video RAM are mapped here. 1018 * We have to add this region, because some systems 1019 * unconditionally write to 0xb8000 (VGA RAM), and 1020 * we need to make sure that vmm(4) permits accesses 1021 * to it. So allocate guest memory for it. 1022 */ 1023 len = MB(1) - (LOWMEM_KB * 1024); 1024 vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024; 1025 vcp->vcp_memranges[1].vmr_size = len; 1026 vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED; 1027 mem_bytes -= len; 1028 1029 /* If we have less than 2MB remaining, still create a 2nd BIOS area. */ 1030 if (mem_bytes <= MB(2)) { 1031 vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END; 1032 vcp->vcp_memranges[2].vmr_size = MB(2); 1033 vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED; 1034 vcp->vcp_nmemranges = 3; 1035 return; 1036 } 1037 1038 /* 1039 * Calculate how to split any remaining memory across the 4GB 1040 * boundary while making sure we do not place physical memory into 1041 * MMIO ranges. 1042 */ 1043 if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) { 1044 above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1); 1045 above_4g = mem_bytes - above_1m; 1046 } else { 1047 above_1m = mem_bytes; 1048 above_4g = 0; 1049 } 1050 1051 /* Third memory region: area above 1MB to MMIO region */ 1052 vcp->vcp_memranges[2].vmr_gpa = MB(1); 1053 vcp->vcp_memranges[2].vmr_size = above_1m; 1054 vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM; 1055 1056 /* Fourth region: PCI MMIO range */ 1057 vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE; 1058 vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END - 1059 VMM_PCI_MMIO_BAR_BASE + 1; 1060 vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO; 1061 1062 /* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */ 1063 vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1; 1064 vcp->vcp_memranges[4].vmr_size = MB(2); 1065 vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED; 1066 1067 /* Sixth region: any remainder above 4GB */ 1068 if (above_4g > 0) { 1069 vcp->vcp_memranges[5].vmr_gpa = GB(4); 1070 vcp->vcp_memranges[5].vmr_size = above_4g; 1071 vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM; 1072 vcp->vcp_nmemranges = 6; 1073 } else 1074 vcp->vcp_nmemranges = 5; 1075 } 1076 1077 /* 1078 * alloc_guest_mem 1079 * 1080 * Allocates memory for the guest. 1081 * Instead of doing a single allocation with one mmap(), we allocate memory 1082 * separately for every range for the following reasons: 1083 * - ASLR for the individual ranges 1084 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to 1085 * map the single mmap'd userspace memory to the individual guest physical 1086 * memory ranges, the underlying amap of the single mmap'd range would have 1087 * to allocate per-page reference counters.
The reason is that the 1088 * individual guest physical ranges would reference the single mmap'd region 1089 * only partially. However, if every guest physical range has its own 1090 * corresponding mmap'd userspace allocation, there are no partial 1091 * references: every guest physical range fully references an mmap'd 1092 * range => no per-page reference counters have to be allocated. 1093 * 1094 * Return values: 1095 * 0: success 1096 * !0: failure - errno indicating the source of the failure 1097 */ 1098 int 1099 alloc_guest_mem(struct vmd_vm *vm) 1100 { 1101 void *p; 1102 char *tmp; 1103 int fd, ret = 0; 1104 size_t i, j; 1105 struct vm_create_params *vcp = &vm->vm_params.vmc_params; 1106 struct vm_mem_range *vmr; 1107 1108 tmp = calloc(32, sizeof(char)); 1109 if (tmp == NULL) { 1110 ret = errno; 1111 log_warn("%s: calloc", __func__); 1112 return (ret); 1113 } 1114 strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32); 1115 1116 vm->vm_nmemfds = vcp->vcp_nmemranges; 1117 1118 for (i = 0; i < vcp->vcp_nmemranges; i++) { 1119 vmr = &vcp->vcp_memranges[i]; 1120 1121 fd = shm_mkstemp(tmp); 1122 if (fd < 0) { 1123 ret = errno; 1124 log_warn("%s: shm_mkstemp", __func__); 1125 return (ret); 1126 } 1127 if (ftruncate(fd, vmr->vmr_size) == -1) { 1128 ret = errno; 1129 log_warn("%s: ftruncate", __func__); 1130 goto out; 1131 } 1132 if (fcntl(fd, F_SETFD, 0) == -1) { 1133 ret = errno; 1134 log_warn("%s: fcntl", __func__); 1135 goto out; 1136 } 1137 if (shm_unlink(tmp) == -1) { 1138 ret = errno; 1139 log_warn("%s: shm_unlink", __func__); 1140 goto out; 1141 } 1142 strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32); 1143 1144 p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, 1145 MAP_SHARED | MAP_CONCEAL, fd, 0); 1146 if (p == MAP_FAILED) { 1147 ret = errno; 1148 for (j = 0; j < i; j++) { 1149 vmr = &vcp->vcp_memranges[j]; 1150 munmap((void *)vmr->vmr_va, vmr->vmr_size); 1151 } 1152 goto out; 1153 } 1154 vm->vm_memfds[i] = fd; 1155 vmr->vmr_va = (vaddr_t)p; 1156 } 1157 out: 1158 free(tmp); 1159 return (ret); 1160 } 1161 1162 /* 1163 * vmm_create_vm 1164 * 1165 * Requests vmm(4) to create a new VM using the supplied creation 1166 * parameters. This operation results in the creation of the in-kernel 1167 * structures for the VM, but does not start the VM's vcpu(s). 
1168 * 1169 * Parameters: 1170 * vm: pointer to the vm object 1171 * 1172 * Return values: 1173 * 0: success 1174 * !0 : ioctl to vmm(4) failed 1175 */ 1176 static int 1177 vmm_create_vm(struct vmd_vm *vm) 1178 { 1179 struct vm_create_params *vcp = &vm->vm_params.vmc_params; 1180 1181 /* Sanity check arguments */ 1182 if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) 1183 return (EINVAL); 1184 1185 if (vcp->vcp_nmemranges == 0 || 1186 vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) 1187 return (EINVAL); 1188 1189 if (vm->vm_params.vmc_ndisks > VM_MAX_DISKS_PER_VM) 1190 return (EINVAL); 1191 1192 if (vm->vm_params.vmc_nnics > VM_MAX_NICS_PER_VM) 1193 return (EINVAL); 1194 1195 if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1) 1196 return (errno); 1197 1198 return (0); 1199 } 1200 1201 /* 1202 * init_emulated_hw 1203 * 1204 * Initializes the userspace hardware emulation 1205 */ 1206 void 1207 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom, 1208 int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) 1209 { 1210 struct vm_create_params *vcp = &vmc->vmc_params; 1211 size_t i; 1212 uint64_t memlo, memhi; 1213 1214 /* Calculate memory size for NVRAM registers */ 1215 memlo = memhi = 0; 1216 for (i = 0; i < vcp->vcp_nmemranges; i++) { 1217 if (vcp->vcp_memranges[i].vmr_gpa == MB(1) && 1218 vcp->vcp_memranges[i].vmr_size > (15 * MB(1))) 1219 memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1)); 1220 else if (vcp->vcp_memranges[i].vmr_gpa == GB(4)) 1221 memhi = vcp->vcp_memranges[i].vmr_size; 1222 } 1223 1224 /* Reset the IO port map */ 1225 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); 1226 1227 /* Init i8253 PIT */ 1228 i8253_init(vcp->vcp_id); 1229 ioports_map[TIMER_CTRL] = vcpu_exit_i8253; 1230 ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; 1231 ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; 1232 ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; 1233 ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc; 1234 1235 /* Init mc146818 RTC */ 1236 mc146818_init(vcp->vcp_id, memlo, memhi); 1237 ioports_map[IO_RTC] = vcpu_exit_mc146818; 1238 ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; 1239 1240 /* Init master and slave PICs */ 1241 i8259_init(); 1242 ioports_map[IO_ICU1] = vcpu_exit_i8259; 1243 ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; 1244 ioports_map[IO_ICU2] = vcpu_exit_i8259; 1245 ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; 1246 ioports_map[ELCR0] = vcpu_exit_elcr; 1247 ioports_map[ELCR1] = vcpu_exit_elcr; 1248 1249 /* Init ns8250 UART */ 1250 ns8250_init(con_fd, vcp->vcp_id); 1251 for (i = COM1_DATA; i <= COM1_SCR; i++) 1252 ioports_map[i] = vcpu_exit_com; 1253 1254 /* Initialize PCI */ 1255 for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++) 1256 ioports_map[i] = vcpu_exit_pci; 1257 1258 ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; 1259 ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; 1260 ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; 1261 ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; 1262 ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci; 1263 pci_init(); 1264 1265 /* Initialize virtio devices */ 1266 virtio_init(current_vm, child_cdrom, child_disks, child_taps); 1267 1268 /* 1269 * Init QEMU fw_cfg interface. Must be done last for pci hardware 1270 * detection. 
1271 */ 1272 fw_cfg_init(vmc); 1273 ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; 1274 ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; 1275 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; 1276 ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; 1277 } 1278 1279 /* 1280 * restore_emulated_hw 1281 * 1282 * Restores the userspace hardware emulation from fd 1283 */ 1284 void 1285 restore_emulated_hw(struct vm_create_params *vcp, int fd, 1286 int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom) 1287 { 1288 /* struct vm_create_params *vcp = &vmc->vmc_params; */ 1289 int i; 1290 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); 1291 1292 /* Init i8253 PIT */ 1293 i8253_restore(fd, vcp->vcp_id); 1294 ioports_map[TIMER_CTRL] = vcpu_exit_i8253; 1295 ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; 1296 ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; 1297 ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; 1298 1299 /* Init master and slave PICs */ 1300 i8259_restore(fd); 1301 ioports_map[IO_ICU1] = vcpu_exit_i8259; 1302 ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; 1303 ioports_map[IO_ICU2] = vcpu_exit_i8259; 1304 ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; 1305 1306 /* Init ns8250 UART */ 1307 ns8250_restore(fd, con_fd, vcp->vcp_id); 1308 for (i = COM1_DATA; i <= COM1_SCR; i++) 1309 ioports_map[i] = vcpu_exit_com; 1310 1311 /* Init mc146818 RTC */ 1312 mc146818_restore(fd, vcp->vcp_id); 1313 ioports_map[IO_RTC] = vcpu_exit_mc146818; 1314 ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; 1315 1316 /* Init QEMU fw_cfg interface */ 1317 fw_cfg_restore(fd); 1318 ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; 1319 ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; 1320 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; 1321 ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; 1322 1323 /* Initialize PCI */ 1324 for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++) 1325 ioports_map[i] = vcpu_exit_pci; 1326 1327 ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; 1328 ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; 1329 ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; 1330 ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; 1331 ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci; 1332 pci_restore(fd); 1333 virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps); 1334 } 1335 1336 /* 1337 * run_vm 1338 * 1339 * Runs the VM whose creation parameters are specified in vmc 1340 * 1341 * Parameters: 1342 * child_cdrom: previously-opened child ISO disk file descriptor 1343 * child_disks: previously-opened child VM disk file descriptors 1344 * child_taps: previously-opened child tap file descriptors 1345 * vmc: vmop_create_params struct containing the VM's desired creation 1346 * configuration 1347 * vrs: VCPU register state to initialize 1348 * 1349 * Return values: 1350 * 0: the VM exited normally 1351 * !0 : the VM exited abnormally or failed to start 1352 */ 1353 static int 1354 run_vm(struct vmop_create_params *vmc, struct vcpu_reg_state *vrs) 1355 { 1356 struct vm_create_params *vcp = &vmc->vmc_params; 1357 struct vm_rwregs_params vregsp; 1358 uint8_t evdone = 0; 1359 size_t i; 1360 int ret; 1361 pthread_t *tid, evtid; 1362 char tname[MAXCOMLEN + 1]; 1363 struct vm_run_params **vrp; 1364 void *exit_status; 1365 1366 if (vcp == NULL) 1367 return (EINVAL); 1368 1369 if (vcp->vcp_nmemranges == 0 || 1370 vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) 1371 return (EINVAL); 1372 1373 tid =
calloc(vcp->vcp_ncpus, sizeof(pthread_t)); 1374 vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *)); 1375 if (tid == NULL || vrp == NULL) { 1376 log_warn("%s: memory allocation error - exiting.", 1377 __progname); 1378 return (ENOMEM); 1379 } 1380 1381 log_debug("%s: starting %zu vcpu thread(s) for vm %s", __func__, 1382 vcp->vcp_ncpus, vcp->vcp_name); 1383 1384 /* 1385 * Create and launch one thread for each VCPU. These threads may 1386 * migrate between PCPUs over time; the need to reload CPU state 1387 * in such situations is detected and performed by vmm(4) in the 1388 * kernel. 1389 */ 1390 for (i = 0 ; i < vcp->vcp_ncpus; i++) { 1391 vrp[i] = malloc(sizeof(struct vm_run_params)); 1392 if (vrp[i] == NULL) { 1393 log_warn("%s: memory allocation error - " 1394 "exiting.", __progname); 1395 /* caller will exit, so skip freeing */ 1396 return (ENOMEM); 1397 } 1398 vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit)); 1399 if (vrp[i]->vrp_exit == NULL) { 1400 log_warn("%s: memory allocation error - " 1401 "exiting.", __progname); 1402 /* caller will exit, so skip freeing */ 1403 return (ENOMEM); 1404 } 1405 vrp[i]->vrp_vm_id = vcp->vcp_id; 1406 vrp[i]->vrp_vcpu_id = i; 1407 1408 if (vcpu_reset(vcp->vcp_id, i, vrs)) { 1409 log_warnx("%s: cannot reset VCPU %zu - exiting.", 1410 __progname, i); 1411 return (EIO); 1412 } 1413 1414 /* once more because reset_cpu changes regs */ 1415 if (current_vm->vm_state & VM_STATE_RECEIVED) { 1416 vregsp.vrwp_vm_id = vcp->vcp_id; 1417 vregsp.vrwp_vcpu_id = i; 1418 vregsp.vrwp_regs = *vrs; 1419 vregsp.vrwp_mask = VM_RWREGS_ALL; 1420 if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS, 1421 &vregsp)) == -1) { 1422 log_warn("%s: writeregs failed", __func__); 1423 return (ret); 1424 } 1425 } 1426 1427 ret = pthread_cond_init(&vcpu_run_cond[i], NULL); 1428 if (ret) { 1429 log_warnx("%s: cannot initialize cond var (%d)", 1430 __progname, ret); 1431 return (ret); 1432 } 1433 1434 ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL); 1435 if (ret) { 1436 log_warnx("%s: cannot initialize mtx (%d)", 1437 __progname, ret); 1438 return (ret); 1439 } 1440 1441 ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL); 1442 if (ret) { 1443 log_warnx("%s: cannot initialize unpause var (%d)", 1444 __progname, ret); 1445 return (ret); 1446 } 1447 1448 ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL); 1449 if (ret) { 1450 log_warnx("%s: cannot initialize unpause mtx (%d)", 1451 __progname, ret); 1452 return (ret); 1453 } 1454 1455 vcpu_hlt[i] = 0; 1456 1457 /* Start each VCPU run thread at vcpu_run_loop */ 1458 ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]); 1459 if (ret) { 1460 /* caller will _exit after this return */ 1461 ret = errno; 1462 log_warn("%s: could not create vcpu thread %zu", 1463 __func__, i); 1464 return (ret); 1465 } 1466 1467 snprintf(tname, sizeof(tname), "vcpu-%zu", i); 1468 pthread_set_name_np(tid[i], tname); 1469 } 1470 1471 log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name); 1472 ret = pthread_create(&evtid, NULL, event_thread, &evdone); 1473 if (ret) { 1474 errno = ret; 1475 log_warn("%s: could not create event thread", __func__); 1476 return (ret); 1477 } 1478 pthread_set_name_np(evtid, "event"); 1479 1480 for (;;) { 1481 ret = pthread_cond_wait(&threadcond, &threadmutex); 1482 if (ret) { 1483 log_warn("%s: waiting on thread state condition " 1484 "variable failed", __func__); 1485 return (ret); 1486 } 1487 1488 /* 1489 * Did a VCPU thread exit with an error? 
=> return the first one 1490 */ 1491 for (i = 0; i < vcp->vcp_ncpus; i++) { 1492 if (vcpu_done[i] == 0) 1493 continue; 1494 1495 if (pthread_join(tid[i], &exit_status)) { 1496 log_warn("%s: failed to join thread %zd - " 1497 "exiting", __progname, i); 1498 return (EIO); 1499 } 1500 1501 ret = (intptr_t)exit_status; 1502 } 1503 1504 /* Did the event thread exit? => return with an error */ 1505 if (evdone) { 1506 if (pthread_join(evtid, &exit_status)) { 1507 log_warn("%s: failed to join event thread - " 1508 "exiting", __progname); 1509 return (EIO); 1510 } 1511 1512 log_warnx("%s: vm %d event thread exited " 1513 "unexpectedly", __progname, vcp->vcp_id); 1514 return (EIO); 1515 } 1516 1517 /* Did all VCPU threads exit successfully? => return */ 1518 for (i = 0; i < vcp->vcp_ncpus; i++) { 1519 if (vcpu_done[i] == 0) 1520 break; 1521 } 1522 if (i == vcp->vcp_ncpus) 1523 return (ret); 1524 1525 /* Some more threads to wait for, start over */ 1526 } 1527 1528 return (ret); 1529 } 1530 1531 void * 1532 event_thread(void *arg) 1533 { 1534 uint8_t *donep = arg; 1535 intptr_t ret; 1536 1537 ret = event_dispatch(); 1538 1539 mutex_lock(&threadmutex); 1540 *donep = 1; 1541 pthread_cond_signal(&threadcond); 1542 mutex_unlock(&threadmutex); 1543 1544 return (void *)ret; 1545 } 1546 1547 /* 1548 * vcpu_run_loop 1549 * 1550 * Runs a single VCPU until vmm(4) requires help handling an exit, 1551 * or the VM terminates. 1552 * 1553 * Parameters: 1554 * arg: vcpu_run_params for the VCPU being run by this thread 1555 * 1556 * Return values: 1557 * NULL: the VCPU shutdown properly 1558 * !NULL: error processing VCPU run, or the VCPU shutdown abnormally 1559 */ 1560 void * 1561 vcpu_run_loop(void *arg) 1562 { 1563 struct vm_run_params *vrp = (struct vm_run_params *)arg; 1564 intptr_t ret = 0; 1565 int irq; 1566 uint32_t n; 1567 1568 vrp->vrp_continue = 0; 1569 n = vrp->vrp_vcpu_id; 1570 1571 for (;;) { 1572 ret = pthread_mutex_lock(&vcpu_run_mtx[n]); 1573 1574 if (ret) { 1575 log_warnx("%s: can't lock vcpu run mtx (%d)", 1576 __func__, (int)ret); 1577 return ((void *)ret); 1578 } 1579 1580 /* If we are halted and need to pause, pause */ 1581 if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) { 1582 ret = pthread_barrier_wait(&vm_pause_barrier); 1583 if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) { 1584 log_warnx("%s: could not wait on pause barrier (%d)", 1585 __func__, (int)ret); 1586 return ((void *)ret); 1587 } 1588 1589 ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]); 1590 if (ret) { 1591 log_warnx("%s: can't lock vcpu unpause mtx (%d)", 1592 __func__, (int)ret); 1593 return ((void *)ret); 1594 } 1595 1596 ret = pthread_cond_wait(&vcpu_unpause_cond[n], 1597 &vcpu_unpause_mtx[n]); 1598 if (ret) { 1599 log_warnx( 1600 "%s: can't wait on unpause cond (%d)", 1601 __func__, (int)ret); 1602 break; 1603 } 1604 ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]); 1605 if (ret) { 1606 log_warnx("%s: can't unlock unpause mtx (%d)", 1607 __func__, (int)ret); 1608 break; 1609 } 1610 } 1611 1612 /* If we are halted and not paused, wait */ 1613 if (vcpu_hlt[n]) { 1614 ret = pthread_cond_wait(&vcpu_run_cond[n], 1615 &vcpu_run_mtx[n]); 1616 1617 if (ret) { 1618 log_warnx( 1619 "%s: can't wait on cond (%d)", 1620 __func__, (int)ret); 1621 (void)pthread_mutex_unlock( 1622 &vcpu_run_mtx[n]); 1623 break; 1624 } 1625 } 1626 1627 ret = pthread_mutex_unlock(&vcpu_run_mtx[n]); 1628 1629 if (ret) { 1630 log_warnx("%s: can't unlock mutex on cond (%d)", 1631 __func__, (int)ret); 1632 break; 1633 } 1634 1635 if 
(vrp->vrp_irqready && i8259_is_pending()) { 1636 irq = i8259_ack(); 1637 vrp->vrp_irq = irq; 1638 } else 1639 vrp->vrp_irq = 0xFFFF; 1640 1641 /* Still more pending? */ 1642 if (i8259_is_pending()) { 1643 /* 1644 * XXX can probably avoid ioctls here by providing intr 1645 * in vrp 1646 */ 1647 if (vcpu_pic_intr(vrp->vrp_vm_id, 1648 vrp->vrp_vcpu_id, 1)) { 1649 fatal("can't set INTR"); 1650 } 1651 } else { 1652 if (vcpu_pic_intr(vrp->vrp_vm_id, 1653 vrp->vrp_vcpu_id, 0)) { 1654 fatal("can't clear INTR"); 1655 } 1656 } 1657 1658 if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) { 1659 /* If run ioctl failed, exit */ 1660 ret = errno; 1661 log_warn("%s: vm %d / vcpu %d run ioctl failed", 1662 __func__, vrp->vrp_vm_id, n); 1663 break; 1664 } 1665 1666 /* If the VM is terminating, exit normally */ 1667 if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) { 1668 ret = (intptr_t)NULL; 1669 break; 1670 } 1671 1672 if (vrp->vrp_exit_reason != VM_EXIT_NONE) { 1673 /* 1674 * vmm(4) needs help handling an exit, handle in 1675 * vcpu_exit. 1676 */ 1677 ret = vcpu_exit(vrp); 1678 if (ret) 1679 break; 1680 } 1681 } 1682 1683 mutex_lock(&threadmutex); 1684 vcpu_done[n] = 1; 1685 pthread_cond_signal(&threadcond); 1686 mutex_unlock(&threadmutex); 1687 1688 return ((void *)ret); 1689 } 1690 1691 int 1692 vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr) 1693 { 1694 struct vm_intr_params vip; 1695 1696 memset(&vip, 0, sizeof(vip)); 1697 1698 vip.vip_vm_id = vm_id; 1699 vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */ 1700 vip.vip_intr = intr; 1701 1702 if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1) 1703 return (errno); 1704 1705 return (0); 1706 } 1707 1708 /* 1709 * vcpu_exit_pci 1710 * 1711 * Handle all I/O to the emulated PCI subsystem. 1712 * 1713 * Parameters: 1714 * vrp: vcpu run parameters containing guest state for this exit 1715 * 1716 * Return value: 1717 * Interrupt to inject to the guest VM, or 0xFF if no interrupt should 1718 * be injected. 1719 */ 1720 uint8_t 1721 vcpu_exit_pci(struct vm_run_params *vrp) 1722 { 1723 struct vm_exit *vei = vrp->vrp_exit; 1724 uint8_t intr; 1725 1726 intr = 0xFF; 1727 1728 switch (vei->vei.vei_port) { 1729 case PCI_MODE1_ADDRESS_REG: 1730 pci_handle_address_reg(vrp); 1731 break; 1732 case PCI_MODE1_DATA_REG: 1733 case PCI_MODE1_DATA_REG + 1: 1734 case PCI_MODE1_DATA_REG + 2: 1735 case PCI_MODE1_DATA_REG + 3: 1736 pci_handle_data_reg(vrp); 1737 break; 1738 case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END: 1739 intr = pci_handle_io(vrp); 1740 break; 1741 default: 1742 log_warnx("%s: unknown PCI register 0x%llx", 1743 __progname, (uint64_t)vei->vei.vei_port); 1744 break; 1745 } 1746 1747 return (intr); 1748 } 1749 1750 /* 1751 * vcpu_exit_inout 1752 * 1753 * Handle all I/O exits that need to be emulated in vmd. This includes the 1754 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device. 1755 * 1756 * Parameters: 1757 * vrp: vcpu run parameters containing guest state for this exit 1758 */ 1759 void 1760 vcpu_exit_inout(struct vm_run_params *vrp) 1761 { 1762 struct vm_exit *vei = vrp->vrp_exit; 1763 uint8_t intr = 0xFF; 1764 1765 if (vei->vei.vei_rep || vei->vei.vei_string) { 1766 #ifdef MMIO_DEBUG 1767 log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x", 1768 __func__, 1769 vei->vei.vei_rep == 0 ? "" : "REP ", 1770 vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT", 1771 vei->vei.vei_string == 0 ? 
"" : "S", 1772 vei->vei.vei_size, vei->vei.vei_encoding, 1773 vei->vei.vei_data, vei->vei.vei_port); 1774 log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx", 1775 __func__, 1776 vei->vrs.vrs_gprs[VCPU_REGS_RCX], 1777 vei->vrs.vrs_gprs[VCPU_REGS_RDX], 1778 vei->vrs.vrs_gprs[VCPU_REGS_RSI]); 1779 #endif /* MMIO_DEBUG */ 1780 fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)", 1781 __func__); 1782 } 1783 1784 if (ioports_map[vei->vei.vei_port] != NULL) 1785 intr = ioports_map[vei->vei.vei_port](vrp); 1786 else if (vei->vei.vei_dir == VEI_DIR_IN) 1787 set_return_data(vei, 0xFFFFFFFF); 1788 1789 if (intr != 0xFF) 1790 vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr); 1791 } 1792 1793 /* 1794 * vcpu_exit_eptviolation 1795 * 1796 * handle an EPT Violation 1797 * 1798 * Parameters: 1799 * vrp: vcpu run parameters containing guest state for this exit 1800 * 1801 * Return values: 1802 * 0: no action required 1803 * EFAULT: a protection fault occured, kill the vm. 1804 */ 1805 int 1806 vcpu_exit_eptviolation(struct vm_run_params *vrp) 1807 { 1808 struct vm_exit *ve = vrp->vrp_exit; 1809 int ret = 0; 1810 #if MMIO_NOTYET 1811 struct x86_insn insn; 1812 uint64_t va, pa; 1813 size_t len = 15; /* Max instruction length in x86. */ 1814 #endif /* MMIO_NOTYET */ 1815 switch (ve->vee.vee_fault_type) { 1816 case VEE_FAULT_HANDLED: 1817 log_debug("%s: fault already handled", __func__); 1818 break; 1819 1820 #if MMIO_NOTYET 1821 case VEE_FAULT_MMIO_ASSIST: 1822 /* Intel VMX might give us the length of the instruction. */ 1823 if (ve->vee.vee_insn_info & VEE_LEN_VALID) 1824 len = ve->vee.vee_insn_len; 1825 1826 if (len > 15) 1827 fatalx("%s: invalid instruction length %lu", __func__, 1828 len); 1829 1830 /* If we weren't given instruction bytes, we need to fetch. */ 1831 if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) { 1832 memset(ve->vee.vee_insn_bytes, 0, 1833 sizeof(ve->vee.vee_insn_bytes)); 1834 va = ve->vrs.vrs_gprs[VCPU_REGS_RIP]; 1835 1836 /* XXX Only support instructions that fit on 1 page. */ 1837 if ((va & PAGE_MASK) + len > PAGE_SIZE) { 1838 log_warnx("%s: instruction might cross page " 1839 "boundary", __func__); 1840 ret = EINVAL; 1841 break; 1842 } 1843 1844 ret = translate_gva(ve, va, &pa, PROT_EXEC); 1845 if (ret != 0) { 1846 log_warnx("%s: failed gva translation", 1847 __func__); 1848 break; 1849 } 1850 1851 ret = read_mem(pa, ve->vee.vee_insn_bytes, len); 1852 if (ret != 0) { 1853 log_warnx("%s: failed to fetch instruction " 1854 "bytes from 0x%llx", __func__, pa); 1855 break; 1856 } 1857 } 1858 1859 ret = insn_decode(ve, &insn); 1860 if (ret == 0) 1861 ret = insn_emulate(ve, &insn); 1862 break; 1863 #endif /* MMIO_NOTYET */ 1864 1865 case VEE_FAULT_PROTECT: 1866 log_debug("%s: EPT Violation: rip=0x%llx", __progname, 1867 ve->vrs.vrs_gprs[VCPU_REGS_RIP]); 1868 ret = EFAULT; 1869 break; 1870 1871 default: 1872 fatalx("%s: invalid fault_type %d", __progname, 1873 ve->vee.vee_fault_type); 1874 /* UNREACHED */ 1875 } 1876 1877 return (ret); 1878 } 1879 1880 /* 1881 * vcpu_exit 1882 * 1883 * Handle a vcpu exit. This function is called when it is determined that 1884 * vmm(4) requires the assistance of vmd to support a particular guest 1885 * exit type (eg, accessing an I/O port or device). Guest state is contained 1886 * in 'vrp', and will be resent to vmm(4) on exit completion. 
1887 * 1888 * Upon conclusion of handling the exit, the function determines if any 1889 * interrupts should be injected into the guest, and asserts the proper 1890 * IRQ line whose interrupt should be vectored. 1891 * 1892 * Parameters: 1893 * vrp: vcpu run parameters containing guest state for this exit 1894 * 1895 * Return values: 1896 * 0: the exit was handled successfully 1897 * 1: an error occurred (eg, unknown exit reason passed in 'vrp') 1898 */ 1899 int 1900 vcpu_exit(struct vm_run_params *vrp) 1901 { 1902 int ret; 1903 1904 switch (vrp->vrp_exit_reason) { 1905 case VMX_EXIT_INT_WINDOW: 1906 case SVM_VMEXIT_VINTR: 1907 case VMX_EXIT_CPUID: 1908 case VMX_EXIT_EXTINT: 1909 case SVM_VMEXIT_INTR: 1910 case SVM_VMEXIT_MSR: 1911 case SVM_VMEXIT_CPUID: 1912 /* 1913 * We may be exiting to vmd to handle a pending interrupt but 1914 * at the same time the last exit type may have been one of 1915 * these. In this case, there's nothing extra to be done 1916 * here (and falling through to the default case below results 1917 * in more vmd log spam). 1918 */ 1919 break; 1920 case SVM_VMEXIT_NPF: 1921 case VMX_EXIT_EPT_VIOLATION: 1922 ret = vcpu_exit_eptviolation(vrp); 1923 if (ret) 1924 return (ret); 1925 break; 1926 case VMX_EXIT_IO: 1927 case SVM_VMEXIT_IOIO: 1928 vcpu_exit_inout(vrp); 1929 break; 1930 case VMX_EXIT_HLT: 1931 case SVM_VMEXIT_HLT: 1932 ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); 1933 if (ret) { 1934 log_warnx("%s: can't lock vcpu mutex (%d)", 1935 __func__, ret); 1936 return (ret); 1937 } 1938 vcpu_hlt[vrp->vrp_vcpu_id] = 1; 1939 ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); 1940 if (ret) { 1941 log_warnx("%s: can't unlock vcpu mutex (%d)", 1942 __func__, ret); 1943 return (ret); 1944 } 1945 break; 1946 case VMX_EXIT_TRIPLE_FAULT: 1947 case SVM_VMEXIT_SHUTDOWN: 1948 /* reset VM */ 1949 return (EAGAIN); 1950 default: 1951 log_debug("%s: unknown exit reason 0x%x", 1952 __progname, vrp->vrp_exit_reason); 1953 } 1954 1955 vrp->vrp_continue = 1; 1956 1957 return (0); 1958 } 1959 1960 /* 1961 * find_gpa_range 1962 * 1963 * Search for a contiguous guest physical mem range. 1964 * 1965 * Parameters: 1966 * vcp: VM create parameters that contain the memory map to search in 1967 * gpa: the starting guest physical address 1968 * len: the length of the memory range 1969 * 1970 * Return values: 1971 * NULL: on failure if there is no memory range as described by the parameters 1972 * Pointer to vm_mem_range that contains the start of the range otherwise. 1973 */ 1974 static struct vm_mem_range * 1975 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len) 1976 { 1977 size_t i, n; 1978 struct vm_mem_range *vmr; 1979 1980 /* Find the first vm_mem_range that contains gpa */ 1981 for (i = 0; i < vcp->vcp_nmemranges; i++) { 1982 vmr = &vcp->vcp_memranges[i]; 1983 if (gpa < vmr->vmr_gpa + vmr->vmr_size) 1984 break; 1985 } 1986 1987 /* No range found. */ 1988 if (i == vcp->vcp_nmemranges) 1989 return (NULL); 1990 1991 /* 1992 * vmr may cover the range [gpa, gpa + len) only partly. Make 1993 * sure that the following vm_mem_ranges are contiguous and 1994 * cover the rest. 
1995 */ 1996 n = vmr->vmr_size - (gpa - vmr->vmr_gpa); 1997 if (len < n) 1998 len = 0; 1999 else 2000 len -= n; 2001 gpa = vmr->vmr_gpa + vmr->vmr_size; 2002 for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) { 2003 vmr = &vcp->vcp_memranges[i]; 2004 if (gpa != vmr->vmr_gpa) 2005 return (NULL); 2006 if (len <= vmr->vmr_size) 2007 len = 0; 2008 else 2009 len -= vmr->vmr_size; 2010 2011 gpa = vmr->vmr_gpa + vmr->vmr_size; 2012 } 2013 2014 if (len != 0) 2015 return (NULL); 2016 2017 return (vmr); 2018 } 2019 2020 /* 2021 * write_mem 2022 * 2023 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'. 2024 * 2025 * Parameters: 2026 * dst: the destination paddr_t in the guest VM 2027 * buf: data to copy (or NULL to zero the data) 2028 * len: number of bytes to copy 2029 * 2030 * Return values: 2031 * 0: success 2032 * EINVAL: if the guest physical memory range [dst, dst + len) does not 2033 * exist in the guest. 2034 */ 2035 int 2036 write_mem(paddr_t dst, const void *buf, size_t len) 2037 { 2038 const char *from = buf; 2039 char *to; 2040 size_t n, off; 2041 struct vm_mem_range *vmr; 2042 2043 vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len); 2044 if (vmr == NULL) { 2045 errno = EINVAL; 2046 log_warn("%s: failed - invalid memory range dst = 0x%lx, " 2047 "len = 0x%zx", __func__, dst, len); 2048 return (EINVAL); 2049 } 2050 2051 off = dst - vmr->vmr_gpa; 2052 while (len != 0) { 2053 n = vmr->vmr_size - off; 2054 if (len < n) 2055 n = len; 2056 2057 to = (char *)vmr->vmr_va + off; 2058 if (buf == NULL) 2059 memset(to, 0, n); 2060 else { 2061 memcpy(to, from, n); 2062 from += n; 2063 } 2064 len -= n; 2065 off = 0; 2066 vmr++; 2067 } 2068 2069 return (0); 2070 } 2071 2072 /* 2073 * read_mem 2074 * 2075 * Reads memory at guest paddr 'src' into 'buf'. 2076 * 2077 * Parameters: 2078 * src: the source paddr_t in the guest VM to read from. 2079 * buf: destination (local) buffer 2080 * len: number of bytes to read 2081 * 2082 * Return values: 2083 * 0: success 2084 * EINVAL: if the guest physical memory range [src, src + len) does not 2085 * exist in the guest. 2086 */ 2087 int 2088 read_mem(paddr_t src, void *buf, size_t len) 2089 { 2090 char *from, *to = buf; 2091 size_t n, off; 2092 struct vm_mem_range *vmr; 2093 2094 vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len); 2095 if (vmr == NULL) { 2096 errno = EINVAL; 2097 log_warn("%s: failed - invalid memory range src = 0x%lx, " 2098 "len = 0x%zx", __func__, src, len); 2099 return (EINVAL); 2100 } 2101 2102 off = src - vmr->vmr_gpa; 2103 while (len != 0) { 2104 n = vmr->vmr_size - off; 2105 if (len < n) 2106 n = len; 2107 2108 from = (char *)vmr->vmr_va + off; 2109 memcpy(to, from, n); 2110 2111 to += n; 2112 len -= n; 2113 off = 0; 2114 vmr++; 2115 } 2116 2117 return (0); 2118 } 2119 2120 /* 2121 * hvaddr_mem 2122 * 2123 * Translate a guest physical address to a host virtual address, checking the 2124 * provided memory range length to confirm it's contiguous within the same 2125 * guest memory range (vm_mem_range).
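 * For illustration only (the names 'foo', 'f' and 'gpa' are hypothetical): a
 * caller needing a host pointer to a guest-resident structure might do
 *
 *	struct foo *f = hvaddr_mem(gpa, sizeof(*f));
 *	if (f == NULL)
 *		return (errno);
 *
 * which fails instead of returning a pointer that would straddle two
 * vm_mem_range entries.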
2126 * 2127 * Parameters: 2128 * gpa: guest physical address to translate 2129 * len: number of bytes in the intended range 2130 * 2131 * Return values: 2132 * void* to host virtual memory on success 2133 * NULL on error, setting errno to: 2134 * EFAULT: gpa falls outside guest memory ranges 2135 * EINVAL: requested len extends beyond memory range 2136 */ 2137 void * 2138 hvaddr_mem(paddr_t gpa, size_t len) 2139 { 2140 struct vm_mem_range *vmr; 2141 size_t off; 2142 2143 vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len); 2144 if (vmr == NULL) { 2145 log_warnx("%s: failed - invalid gpa: 0x%lx\n", __func__, gpa); 2146 errno = EFAULT; 2147 return (NULL); 2148 } 2149 2150 off = gpa - vmr->vmr_gpa; 2151 if (len > (vmr->vmr_size - off)) { 2152 log_warnx("%s: failed - invalid memory range: gpa=0x%lx, " 2153 "len=%zu", __func__, gpa, len); 2154 errno = EINVAL; 2155 return (NULL); 2156 } 2157 2158 return ((char *)vmr->vmr_va + off); 2159 } 2160 2161 /* 2162 * vcpu_assert_pic_irq 2163 * 2164 * Injects the specified IRQ on the supplied vcpu/vm 2165 * 2166 * Parameters: 2167 * vm_id: VM ID to inject to 2168 * vcpu_id: VCPU ID to inject to 2169 * irq: IRQ to inject 2170 */ 2171 void 2172 vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq) 2173 { 2174 int ret; 2175 2176 i8259_assert_irq(irq); 2177 2178 if (i8259_is_pending()) { 2179 if (vcpu_pic_intr(vm_id, vcpu_id, 1)) 2180 fatalx("%s: can't assert INTR", __func__); 2181 2182 ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]); 2183 if (ret) 2184 fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret); 2185 2186 vcpu_hlt[vcpu_id] = 0; 2187 ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]); 2188 if (ret) 2189 fatalx("%s: can't signal (%d)", __func__, ret); 2190 ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]); 2191 if (ret) 2192 fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret); 2193 } 2194 } 2195 2196 /* 2197 * vcpu_deassert_pic_irq 2198 * 2199 * Clears the specified IRQ on the supplied vcpu/vm 2200 * 2201 * Parameters: 2202 * vm_id: VM ID to clear in 2203 * vcpu_id: VCPU ID to clear in 2204 * irq: IRQ to clear 2205 */ 2206 void 2207 vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq) 2208 { 2209 i8259_deassert_irq(irq); 2210 2211 if (!i8259_is_pending()) { 2212 if (vcpu_pic_intr(vm_id, vcpu_id, 0)) 2213 fatalx("%s: can't deassert INTR for vm_id %d, " 2214 "vcpu_id %d", __func__, vm_id, vcpu_id); 2215 } 2216 } 2217 2218 /* 2219 * fd_hasdata 2220 * 2221 * Determines if data can be read from a file descriptor. 2222 * 2223 * Parameters: 2224 * fd: the fd to check 2225 * 2226 * Return values: 2227 * 1 if data can be read from an fd, or 0 otherwise.
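 * The poll(2) call below is made with a timeout of zero, so the check never blocks; it only reports whether data is already pending on 'fd'.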
2228 */ 2229 int 2230 fd_hasdata(int fd) 2231 { 2232 struct pollfd pfd[1]; 2233 int nready, hasdata = 0; 2234 2235 pfd[0].fd = fd; 2236 pfd[0].events = POLLIN; 2237 nready = poll(pfd, 1, 0); 2238 if (nready == -1) 2239 log_warn("checking file descriptor for data failed"); 2240 else if (nready == 1 && pfd[0].revents & POLLIN) 2241 hasdata = 1; 2242 return (hasdata); 2243 } 2244 2245 /* 2246 * mutex_lock 2247 * 2248 * Wrapper function for pthread_mutex_lock that does error checking and that 2249 * exits on failure 2250 */ 2251 void 2252 mutex_lock(pthread_mutex_t *m) 2253 { 2254 int ret; 2255 2256 ret = pthread_mutex_lock(m); 2257 if (ret) { 2258 errno = ret; 2259 fatal("could not acquire mutex"); 2260 } 2261 } 2262 2263 /* 2264 * mutex_unlock 2265 * 2266 * Wrapper function for pthread_mutex_unlock that does error checking and that 2267 * exits on failure 2268 */ 2269 void 2270 mutex_unlock(pthread_mutex_t *m) 2271 { 2272 int ret; 2273 2274 ret = pthread_mutex_unlock(m); 2275 if (ret) { 2276 errno = ret; 2277 fatal("could not release mutex"); 2278 } 2279 } 2280 2281 /* 2282 * set_return_data 2283 * 2284 * Utility function for manipulating register data in vm exit info structs. This 2285 * function ensures that the data is copied to the vei->vei.vei_data field with 2286 * the proper size for the operation being performed. 2287 * 2288 * Parameters: 2289 * vei: exit information 2290 * data: return data 2291 */ 2292 void 2293 set_return_data(struct vm_exit *vei, uint32_t data) 2294 { 2295 switch (vei->vei.vei_size) { 2296 case 1: 2297 vei->vei.vei_data &= ~0xFF; 2298 vei->vei.vei_data |= (uint8_t)data; 2299 break; 2300 case 2: 2301 vei->vei.vei_data &= ~0xFFFF; 2302 vei->vei.vei_data |= (uint16_t)data; 2303 break; 2304 case 4: 2305 vei->vei.vei_data = data; 2306 break; 2307 } 2308 } 2309 2310 /* 2311 * get_input_data 2312 * 2313 * Utility function for manipulating register data in vm exit info 2314 * structs. This function ensures that the data is copied from the 2315 * vei->vei.vei_data field with the proper size for the operation being 2316 * performed. 2317 * 2318 * Parameters: 2319 * vei: exit information 2320 * data: location to store the result 2321 */ 2322 void 2323 get_input_data(struct vm_exit *vei, uint32_t *data) 2324 { 2325 switch (vei->vei.vei_size) { 2326 case 1: 2327 *data &= 0xFFFFFF00; 2328 *data |= (uint8_t)vei->vei.vei_data; 2329 break; 2330 case 2: 2331 *data &= 0xFFFF0000; 2332 *data |= (uint16_t)vei->vei.vei_data; 2333 break; 2334 case 4: 2335 *data = vei->vei.vei_data; 2336 break; 2337 default: 2338 log_warnx("%s: invalid i/o size %d", __func__, 2339 vei->vei.vei_size); 2340 } 2341 2342 } 2343 2344 /* 2345 * translate_gva 2346 * 2347 * Translates a guest virtual address to a guest physical address by walking 2348 * the currently active page table (if needed). 2349 * 2350 * XXX ensure translate_gva updates the A bit in the PTE 2351 * XXX ensure translate_gva respects segment base and limits in i386 mode 2352 * XXX ensure translate_gva respects segment wraparound in i8086 mode 2353 * XXX ensure translate_gva updates the A bit in the segment selector 2354 * XXX ensure translate_gva respects CR4.LMSLE if available 2355 * 2356 * Parameters: 2357 * exit: The VCPU this translation should be performed for (guest MMU settings 2358 * are gathered from this VCPU) 2359 * va: virtual address to translate 2360 * pa: pointer to paddr_t variable that will receive the translated physical 2361 * address. 'pa' is unchanged on error. 
2362 * mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which 2363 * the address should be translated 2364 * 2365 * Return values: 2366 * 0: the address was successfully translated - 'pa' contains the physical 2367 * address currently mapped by 'va'. 2368 * EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case 2369 * and %cr2 set in the vcpu structure. 2370 * EINVAL: an error occurred reading paging table structures 2371 */ 2372 int 2373 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode) 2374 { 2375 int level, shift, pdidx; 2376 uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask; 2377 uint64_t shift_width, pte_size; 2378 struct vcpu_reg_state *vrs; 2379 2380 vrs = &exit->vrs; 2381 2382 if (!pa) 2383 return (EINVAL); 2384 2385 if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) { 2386 log_debug("%s: unpaged, va=pa=0x%llx", __func__, va); 2387 *pa = va; 2388 return (0); 2389 } 2390 2391 pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3]; 2392 2393 log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__, 2394 vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]); 2395 2396 if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) { 2397 if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) { 2398 pte_size = sizeof(uint64_t); 2399 shift_width = 9; 2400 2401 if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) { 2402 /* 4 level paging */ 2403 level = 4; 2404 mask = L4_MASK; 2405 shift = L4_SHIFT; 2406 } else { 2407 /* 32 bit with PAE paging */ 2408 level = 3; 2409 mask = L3_MASK; 2410 shift = L3_SHIFT; 2411 } 2412 } else { 2413 /* 32 bit paging */ 2414 level = 2; 2415 shift_width = 10; 2416 mask = 0xFFC00000; 2417 shift = 22; 2418 pte_size = sizeof(uint32_t); 2419 } 2420 } else 2421 return (EINVAL); 2422 2423 /* XXX: Check for R bit in segment selector and set A bit */ 2424 2425 for (;level > 0; level--) { 2426 pdidx = (va & mask) >> shift; 2427 pte_paddr = (pt_paddr) + (pdidx * pte_size); 2428 2429 log_debug("%s: read pte level %d @ GPA 0x%llx", __func__, 2430 level, pte_paddr); 2431 if (read_mem(pte_paddr, &pte, pte_size)) { 2432 log_warn("%s: failed to read pte", __func__); 2433 return (EFAULT); 2434 } 2435 2436 log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr, 2437 pte); 2438 2439 /* XXX: Set CR2 */ 2440 if (!(pte & PG_V)) 2441 return (EFAULT); 2442 2443 /* XXX: Check for SMAP */ 2444 if ((mode == PROT_WRITE) && !(pte & PG_RW)) 2445 return (EPERM); 2446 2447 if ((exit->cpl > 0) && !(pte & PG_u)) 2448 return (EPERM); 2449 2450 pte = pte | PG_U; 2451 if (mode == PROT_WRITE) 2452 pte = pte | PG_M; 2453 if (write_mem(pte_paddr, &pte, pte_size)) { 2454 log_warn("%s: failed to write back flags to pte", 2455 __func__); 2456 return (EIO); 2457 } 2458 2459 /* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */ 2460 if (pte & PG_PS) 2461 break; 2462 2463 if (level > 1) { 2464 pt_paddr = pte & PG_FRAME; 2465 shift -= shift_width; 2466 mask = mask >> shift_width; 2467 } 2468 } 2469 2470 low_mask = (1 << shift) - 1; 2471 high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask; 2472 *pa = (pte & high_mask) | (va & low_mask); 2473 2474 log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa); 2475 2476 return (0); 2477 } 2478 2479 /* 2480 * vm_pipe_init 2481 * 2482 * Initialize a vm_dev_pipe, setting up its file descriptors and its 2483 * event structure with the given callback. 
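 * Only event_set(3) is performed here; the caller is expected to register p->read_ev with its event loop (typically via event_add(3)) before any messages can be delivered to 'cb'.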
2484 * 2485 * Parameters: 2486 * p: pointer to vm_dev_pipe struct to initialize 2487 * cb: callback to use for READ events on the read end of the pipe 2488 */ 2489 void 2490 vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *)) 2491 { 2492 int ret; 2493 int fds[2]; 2494 2495 memset(p, 0, sizeof(struct vm_dev_pipe)); 2496 2497 ret = pipe(fds); 2498 if (ret) 2499 fatal("failed to create vm_dev_pipe pipe"); 2500 2501 p->read = fds[0]; 2502 p->write = fds[1]; 2503 2504 event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL); 2505 } 2506 2507 /* 2508 * vm_pipe_send 2509 * 2510 * Send a message to an emulated device via the provided vm_dev_pipe. 2511 * 2512 * Parameters: 2513 * p: pointer to initialized vm_dev_pipe 2514 * msg: message to send in the channel 2515 */ 2516 void 2517 vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg) 2518 { 2519 size_t n; 2520 n = write(p->write, &msg, sizeof(msg)); 2521 if (n != sizeof(msg)) 2522 fatal("failed to write to device pipe"); 2523 } 2524 2525 /* 2526 * vm_pipe_recv 2527 * 2528 * Receive a message for an emulated device via the provided vm_dev_pipe. 2529 * Returns the message value, otherwise will exit on failure. 2530 * 2531 * Parameters: 2532 * p: pointer to initialized vm_dev_pipe 2533 * 2534 * Return values: 2535 * a value of enum pipe_msg_type or fatal exit on read(2) error 2536 */ 2537 enum pipe_msg_type 2538 vm_pipe_recv(struct vm_dev_pipe *p) 2539 { 2540 size_t n; 2541 enum pipe_msg_type msg; 2542 n = read(p->read, &msg, sizeof(msg)); 2543 if (n != sizeof(msg)) 2544 fatal("failed to read from device pipe"); 2545 2546 return msg; 2547 } 2548 2549 /* 2550 * Re-map the guest address space using the shared memory file descriptor. 2551 * 2552 * Returns 0 on success, non-zero in the event of failure. 2553 */ 2554 int 2555 remap_guest_mem(struct vmd_vm *vm) 2556 { 2557 struct vm_create_params *vcp; 2558 struct vm_mem_range *vmr; 2559 size_t i, j; 2560 void *p = NULL; 2561 int ret; 2562 2563 if (vm == NULL) 2564 return (1); 2565 2566 vcp = &vm->vm_params.vmc_params; 2567 2568 /* 2569 * We've execve'd, so we need to re-map the guest VM memory. Iterate 2570 * over all possible vm_mem_range entries so we can initialize all 2571 * file descriptors to a value. 2572 */ 2573 for (i = 0; i < VMM_MAX_MEM_RANGES; i++) { 2574 if (i < vcp->vcp_nmemranges) { 2575 vmr = &vcp->vcp_memranges[i]; 2576 /* Skip ranges we know we don't need right now. */ 2577 if (vmr->vmr_type == VM_MEM_MMIO) { 2578 log_debug("%s: skipping range i=%ld, type=%d", 2579 __func__, i, vmr->vmr_type); 2580 vm->vm_memfds[i] = -1; 2581 continue; 2582 } 2583 /* Re-mmap the memrange. */ 2584 p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, 2585 MAP_SHARED | MAP_CONCEAL, vm->vm_memfds[i], 0); 2586 if (p == MAP_FAILED) { 2587 ret = errno; 2588 log_warn("%s: mmap", __func__); 2589 for (j = 0; j < i; j++) { 2590 vmr = &vcp->vcp_memranges[j]; 2591 munmap((void *)vmr->vmr_va, 2592 vmr->vmr_size); 2593 } 2594 return (ret); 2595 } 2596 vmr->vmr_va = (vaddr_t)p; 2597 } else { 2598 /* Initialize with an invalid fd. */ 2599 vm->vm_memfds[i] = -1; 2600 } 2601 } 2602 2603 return (0); 2604 } 2605
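/*
 * Illustrative sketch (not part of vmd itself): a device typically pairs
 * vm_pipe_init() with event_add(3) on the read end, signals the event
 * thread with vm_pipe_send() and drains messages with vm_pipe_recv() in
 * its callback. The names 'dev_pipe', 'dev_pipe_cb' and 'msg' below are
 * hypothetical.
 *
 *	static struct vm_dev_pipe dev_pipe;
 *
 *	static void
 *	dev_pipe_cb(int fd, short event, void *arg)
 *	{
 *		enum pipe_msg_type msg = vm_pipe_recv(&dev_pipe);
 *		// act on msg
 *	}
 *
 *	vm_pipe_init(&dev_pipe, dev_pipe_cb);
 *	event_add(&dev_pipe.read_ev, NULL);
 *
 *	// from another thread, with msg an enum pipe_msg_type value:
 *	vm_pipe_send(&dev_pipe, msg);
 */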