/*	$OpenBSD: vm.c,v 1.71 2022/06/29 17:39:54 dv Exp $	*/

/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>	/* PAGE_SIZE */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/wait.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/resource.h>

#include <dev/ic/i8253reg.h>
#include <dev/isa/isareg.h>
#include <dev/pci/pcireg.h>

#include <machine/psl.h>
#include <machine/pte.h>
#include <machine/specialreg.h>
#include <machine/vmmvar.h>

#include <net/if.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <imsg.h>
#include <limits.h>
#include <poll.h>
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <util.h>

#include "atomicio.h"
#include "fw_cfg.h"
#include "i8253.h"
#include "i8259.h"
#include "loadfile.h"
#include "mc146818.h"
#include "ns8250.h"
#include "pci.h"
#include "virtio.h"
#include "vmd.h"
#include "vmm.h"

#define MB(x)	(x * 1024UL * 1024UL)
#define GB(x)	(x * 1024UL * 1024UL * 1024UL)

io_fn_t ioports_map[MAX_PORTS];

int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
    struct vmop_create_params *, struct vcpu_reg_state *);
void vm_dispatch_vmm(int, short, void *);
void *event_thread(void *);
void *vcpu_run_loop(void *);
int vcpu_exit(struct vm_run_params *);
int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
void create_memory_map(struct vm_create_params *);
int alloc_guest_mem(struct vm_create_params *);
int vmm_create_vm(struct vm_create_params *);
void init_emulated_hw(struct vmop_create_params *, int,
    int[][VM_MAX_BASE_PER_DISK], int *);
void restore_emulated_hw(struct vm_create_params *, int, int *,
    int[][VM_MAX_BASE_PER_DISK], int);
void vcpu_exit_inout(struct vm_run_params *);
int vcpu_exit_eptviolation(struct vm_run_params *);
uint8_t vcpu_exit_pci(struct vm_run_params *);
int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
int send_vm(int, struct vm_create_params *);
int dump_send_header(int);
int dump_vmr(int, struct vm_mem_range *);
int dump_mem(int, struct vm_create_params *);
void restore_vmr(int, struct vm_mem_range *);
void restore_mem(int, struct vm_create_params *);
int restore_vm_params(int, struct vm_create_params *);
void pause_vm(struct vm_create_params *);
void unpause_vm(struct vm_create_params *);

int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int);

static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
    size_t);

int con_fd;
struct vmd_vm *current_vm;

extern struct vmd *env;

extern char *__progname;

pthread_mutex_t threadmutex;
pthread_cond_t threadcond;

pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
pthread_barrier_t vm_pause_barrier;
pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];

/*
 * Represents a standard register set for an OS to be booted
 * as a flat 64 bit address space.
 *
 * NOT set here are:
 *  RIP
 *  RSP
 *  GDTR BASE
 *
 * Specific bootloaders should clone this structure and override
 * those fields as needed.
 *
 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
 *        features of the CPU in use.
 */
static const struct vcpu_reg_state vcpu_init_flat64 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
};

/*
 * Represents a standard register set for a BIOS to be booted
 * as a flat 16 bit address space.
 */
static const struct vcpu_reg_state vcpu_init_flat16 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
	.vrs_crs[VCPU_REGS_CR3] = 0,
	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
};

/*
 * loadfile_bios
 *
 * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
 * directly into memory.
 *
 * Parameters:
 *  fp: file stream of the BIOS image to load
 *  size: uncompressed size of the image
 *  (out) vrs: register state to set on init for this kernel
 *
 * Return values:
 *  0 if successful
 *  various error codes returned from read(2) or loadelf functions
 */
int
loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
{
	off_t off;

	/* Set up a "flat 16 bit" register state for BIOS */
	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));

	/* Seek to the beginning of the BIOS image */
	if (gzseek(fp, 0, SEEK_SET) == -1)
		return (-1);

	/* The BIOS image must end at 1MB */
	if ((off = MB(1) - size) < 0)
		return (-1);

	/* Read BIOS image into memory */
	if (mread(fp, off, size) != (size_t)size) {
		errno = EIO;
		return (-1);
	}

	if (gzseek(fp, 0, SEEK_SET) == -1)
		return (-1);

	/* Read a second BIOS copy into memory ending at 4GB */
	off = GB(4) - size;
	if (mread(fp, off, size) != (size_t)size) {
		errno = EIO;
		return (-1);
	}

	log_debug("%s: loaded BIOS image", __func__);

	return (0);
}

/*
 * start_vm
 *
 * After forking a new VM process, starts the new VM with the creation
 * parameters supplied (in the incoming vm->vm_params field). This
 * function performs a basic sanity check on the incoming parameters
 * and then performs the following steps to complete the creation of the VM:
 *
 * 1. validates and creates the new VM
 * 2. opens the imsg control channel to the parent and drops more privilege
 * 3. drops additional privileges by calling pledge(2)
 * 4. loads the kernel from the disk image or file descriptor
 * 5. runs the VM's VCPU loops.
 *
 * Parameters:
 *  vm: The VM data structure, including the VM create parameters.
 *  fd: The imsg socket that is connected to the parent process.
 *
 * Return values:
 *  0: success
 *  !0 : failure - typically an errno indicating the source of the failure
 */
int
start_vm(struct vmd_vm *vm, int fd)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params	*vcp = &vmc->vmc_params;
	struct vcpu_reg_state vrs;
	int nicfds[VMM_MAX_NICS_PER_VM];
	int ret;
	gzFile fp;
	size_t i;
	struct vm_rwregs_params vrp;
	struct stat sb;

	/* Child */
	setproctitle("%s", vcp->vcp_name);
	log_procinit(vcp->vcp_name);

	if (!(vm->vm_state & VM_STATE_RECEIVED))
		create_memory_map(vcp);

	ret = alloc_guest_mem(vcp);

	if (ret) {
		struct rlimit lim;
		char buf[FMT_SCALED_STRSIZE];
		if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) {
			if (fmt_scaled(lim.rlim_cur, buf) == 0)
				fatalx("could not allocate guest memory (data "
				    "limit is %s)", buf);
		}
		errno = ret;
		fatal("could not allocate guest memory");
	}

	ret = vmm_create_vm(vcp);
	current_vm = vm;

	/* send back the kernel-generated vm id (0 on error) */
	if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
	    sizeof(vcp->vcp_id))
		fatal("failed to send created vm id to vmm process");

	if (ret) {
		errno = ret;
		fatal("create vmm ioctl failed - exiting");
	}

	/*
	 * pledge in the vm processes:
	 * stdio - for malloc and basic I/O including events.
	 * recvfd - for send/recv.
	 * vmm - for the vmm ioctls and operations.
	 */
	if (pledge("stdio vmm recvfd", NULL) == -1)
		fatal("pledge");

	if (vm->vm_state & VM_STATE_RECEIVED) {
		ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp));
		if (ret != sizeof(vrp))
			fatal("received incomplete vrp - exiting");
		vrs = vrp.vrwp_regs;
	} else {
		/*
		 * Set up default "flat 64 bit" register state - RIP,
		 * RSP, and GDT info will be set in bootloader
		 */
		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));

		/* Find and open kernel image */
		if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
			fatalx("failed to open kernel - exiting");

		/* Load kernel image */
		ret = loadfile_elf(fp, vcp, &vrs, vmc->vmc_bootdevice);

		/*
		 * Try BIOS as a fallback (only if it was provided as an image
		 * with vm->vm_kernel and the file is not compressed)
		 */
		if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
		    gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
			ret = loadfile_bios(fp, sb.st_size, &vrs);

		if (ret)
			fatal("failed to load kernel or BIOS - exiting");

		gzclose(fp);
	}

	if (vm->vm_kernel != -1)
		close(vm->vm_kernel);

	con_fd = vm->vm_tty;
	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
		fatal("failed to set nonblocking mode on console");

	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
		nicfds[i] = vm->vm_ifs[i].vif_fd;

	event_init();

	if (vm->vm_state & VM_STATE_RECEIVED) {
		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
		    vm->vm_disks, vm->vm_cdrom);
		restore_mem(vm->vm_receive_fd, vcp);
		if (restore_vm_params(vm->vm_receive_fd, vcp))
			fatal("restore vm params failed");
		unpause_vm(vcp);
	}

	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
		fatal("setup vm pipe");

	/* Execute the vcpu run loop(s) for this VM */
	ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);

	/* Ensure that any in-flight data is written back */
	virtio_shutdown(vm);

	return (ret);
}

/*
 * vm_dispatch_vmm
 *
 * imsg callback for messages that are received from the vmm parent process.
 */
void
vm_dispatch_vmm(int fd, short event, void *arg)
{
	struct vmd_vm *vm = arg;
	struct vmop_result vmr;
	struct vmop_addr_result var;
	struct imsgev *iev = &vm->vm_iev;
	struct imsgbuf *ibuf = &iev->ibuf;
	struct imsg imsg;
	ssize_t n;
	int verbose;

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0)
			_exit(0);
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
		if (n == 0)
			_exit(0);
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

#if DEBUG > 1
		log_debug("%s: got imsg %d from %s",
		    __func__, imsg.hdr.type,
		    vm->vm_params.vmc_params.vcp_name);
#endif

		switch (imsg.hdr.type) {
		case IMSG_CTL_VERBOSE:
			IMSG_SIZE_CHECK(&imsg, &verbose);
			memcpy(&verbose, imsg.data, sizeof(verbose));
			log_setverbose(verbose);
			break;
		case IMSG_VMDOP_VM_SHUTDOWN:
			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_VM_REBOOT:
			if (vmmci_ctl(VMMCI_REBOOT) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_PAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			pause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_UNPAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			unpause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_SEND_VM_REQUEST:
			vmr.vmr_id = vm->vm_vmid;
			vmr.vmr_result = send_vm(imsg.fd,
			    &vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_SEND_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			if (!vmr.vmr_result) {
				imsg_flush(&current_vm->vm_iev.ibuf);
				_exit(0);
			}
			break;
		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
			IMSG_SIZE_CHECK(&imsg, &var);
			memcpy(&var, imsg.data, sizeof(var));

			log_debug("%s: received tap addr %s for nic %d",
			    vm->vm_params.vmc_params.vcp_name,
			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);

			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
			break;
		default:
			fatalx("%s: got invalid imsg %d from %s",
			    __func__, imsg.hdr.type,
			    vm->vm_params.vmc_params.vcp_name);
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}

/*
 * vm_shutdown
 *
 * Tell the vmm parent process to shutdown or reboot the VM and exit.
 */
__dead void
vm_shutdown(unsigned int cmd)
{
	switch (cmd) {
	case VMMCI_NONE:
	case VMMCI_SHUTDOWN:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
		break;
	case VMMCI_REBOOT:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
		break;
	default:
		fatalx("invalid vm ctl command: %d", cmd);
	}
	imsg_flush(&current_vm->vm_iev.ibuf);

	_exit(0);
}

int
send_vm(int fd, struct vm_create_params *vcp)
{
	struct vm_rwregs_params vrp;
	struct vm_rwvmparams_params vpp;
	struct vmop_create_params *vmc;
	struct vm_terminate_params vtp;
	unsigned int flags = 0;
	unsigned int i;
	int ret = 0;
	size_t sz;

	if (dump_send_header(fd)) {
		log_info("%s: failed to send vm dump header", __func__);
		goto err;
	}

	pause_vm(vcp);

	vmc = calloc(1, sizeof(struct vmop_create_params));
	if (vmc == NULL) {
		log_warn("%s: calloc error getting vmc", __func__);
		ret = -1;
		goto err;
	}

	flags |= VMOP_CREATE_MEMORY;
	memcpy(&vmc->vmc_params, &current_vm->vm_params,
	    sizeof(struct vmop_create_params));
	vmc->vmc_flags = flags;
	vrp.vrwp_vm_id = vcp->vcp_id;
	vrp.vrwp_mask = VM_RWREGS_ALL;
	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
	vpp.vpp_vm_id = vcp->vcp_id;

	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
	if (sz != sizeof(struct vmop_create_params)) {
		ret = -1;
		goto err;
	}

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp.vrwp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
			log_warn("%s: readregs failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vrp,
		    sizeof(struct vm_rwregs_params));
		if (sz != sizeof(struct vm_rwregs_params)) {
			log_warn("%s: dumping registers failed", __func__);
			ret = -1;
			goto err;
		}
	}

	if ((ret = i8253_dump(fd)))
		goto err;
	if ((ret = i8259_dump(fd)))
		goto err;
	if ((ret = ns8250_dump(fd)))
		goto err;
	if ((ret = mc146818_dump(fd)))
		goto err;
	if ((ret = fw_cfg_dump(fd)))
		goto err;
	if ((ret = pci_dump(fd)))
		goto err;
	if ((ret = virtio_dump(fd)))
		goto err;
	if ((ret = dump_mem(fd, vcp)))
		goto err;

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vpp.vpp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
			log_warn("%s: readvmparams failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vpp,
		    sizeof(struct vm_rwvmparams_params));
		if (sz != sizeof(struct vm_rwvmparams_params)) {
			log_warn("%s: dumping vm params failed", __func__);
			ret = -1;
			goto err;
		}
	}

	vtp.vtp_vm_id = vcp->vcp_id;
	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
		log_warnx("%s: term IOC error: %d, %d", __func__,
		    errno, ENOENT);
	}
err:
	close(fd);
	if (ret)
		unpause_vm(vcp);
	return ret;
}

int
dump_send_header(int fd) {
	struct vm_dump_header vmh;
	int i;

	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
	    sizeof(vmh.vmh_signature));

	vmh.vmh_cpuids[0].code = 0x00;
	vmh.vmh_cpuids[0].leaf = 0x00;

	vmh.vmh_cpuids[1].code = 0x01;
	vmh.vmh_cpuids[1].leaf = 0x00;

	vmh.vmh_cpuids[2].code = 0x07;
	vmh.vmh_cpuids[2].leaf = 0x00;

	vmh.vmh_cpuids[3].code = 0x0d;
	vmh.vmh_cpuids[3].leaf = 0x00;

	vmh.vmh_cpuids[4].code = 0x80000001;
	vmh.vmh_cpuids[4].leaf = 0x00;

	vmh.vmh_version = VM_DUMP_VERSION;

	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
		CPUID_LEAF(vmh.vmh_cpuids[i].code,
		    vmh.vmh_cpuids[i].leaf,
		    vmh.vmh_cpuids[i].a,
		    vmh.vmh_cpuids[i].b,
		    vmh.vmh_cpuids[i].c,
		    vmh.vmh_cpuids[i].d);
	}

	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
		return (-1);

	return (0);
}

int
dump_mem(int fd, struct vm_create_params *vcp)
{
	unsigned int i;
	int ret;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		ret = dump_vmr(fd, vmr);
		if (ret)
			return ret;
	}
	return (0);
}

int
restore_vm_params(int fd, struct vm_create_params *vcp) {
	unsigned int i;
	struct vm_rwvmparams_params vpp;

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
			log_warn("%s: error restoring vm params", __func__);
			return (-1);
		}
		vpp.vpp_vm_id = vcp->vcp_id;
		vpp.vpp_vcpu_id = i;
		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
			log_debug("%s: writing vm params failed", __func__);
			return (-1);
		}
	}
	return (0);
}

void
restore_mem(int fd, struct vm_create_params *vcp)
{
	unsigned int i;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		restore_vmr(fd, vmr);
	}
}

int
dump_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t rem = vmr->vmr_size, read = 0;
	char buf[PAGE_SIZE];

	while (rem > 0) {
		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
			log_warn("failed to read vmr");
			return (-1);
		}
		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
			log_warn("failed to dump vmr");
			return (-1);
		}
		rem = rem - PAGE_SIZE;
		read = read + PAGE_SIZE;
	}
	return (0);
}

void
restore_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t rem = vmr->vmr_size, wrote = 0;
	char buf[PAGE_SIZE];

	while (rem > 0) {
		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
			fatal("failed to restore vmr");
		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
			fatal("failed to write vmr");
		rem = rem - PAGE_SIZE;
		wrote = wrote + PAGE_SIZE;
	}
}

void
pause_vm(struct vm_create_params *vcp)
{
	unsigned int n;
	int ret;
	if (current_vm->vm_state & VM_STATE_PAUSED)
		return;

	current_vm->vm_state |= VM_STATE_PAUSED;

	ret = pthread_barrier_init(&vm_pause_barrier, NULL, vcp->vcp_ncpus + 1);
	if (ret) {
		log_warnx("%s: cannot initialize pause barrier (%d)",
		    __progname, ret);
		return;
	}

	for (n = 0; n < vcp->vcp_ncpus; n++) {
		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
		if (ret) {
			log_warnx("%s: can't broadcast vcpu run cond (%d)",
			    __func__, (int)ret);
			return;
		}
	}
	ret = pthread_barrier_wait(&vm_pause_barrier);
	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
		log_warnx("%s: could not wait on pause barrier (%d)",
		    __func__, (int)ret);
		return;
	}

	ret = pthread_barrier_destroy(&vm_pause_barrier);
	if (ret) {
		log_warnx("%s: could not destroy pause barrier (%d)",
		    __progname, ret);
		return;
	}

	i8253_stop();
	mc146818_stop();
	ns8250_stop();
	virtio_stop(vcp);
}

void
unpause_vm(struct vm_create_params *vcp)
{
	unsigned int n;
	int ret;
	if (!(current_vm->vm_state & VM_STATE_PAUSED))
		return;

	current_vm->vm_state &= ~VM_STATE_PAUSED;
	for (n = 0; n < vcp->vcp_ncpus; n++) {
		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
		if (ret) {
			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
			    __func__, (int)ret);
			return;
		}
	}

	i8253_start();
	mc146818_start();
	ns8250_start();
	virtio_start(vcp);
}

/*
 * vcpu_reset
 *
 * Requests vmm(4) to reset the VCPUs in the indicated VM to
 * the register state provided
 *
 * Parameters
 *  vmid: VM ID to reset
 *  vcpu_id: VCPU ID to reset
 *  vrs: the register state to initialize
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
 *      valid)
 */
int
vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
{
	struct vm_resetcpu_params vrp;

	memset(&vrp, 0, sizeof(vrp));
	vrp.vrp_vm_id = vmid;
	vrp.vrp_vcpu_id = vcpu_id;
	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));

	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);

	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
		return (errno);

	return (0);
}

/*
 * create_memory_map
 *
 * Sets up the guest physical memory ranges that the VM can access.
 *
 * Parameters:
 *  vcp: VM create parameters describing the VM whose memory map
 *       is being created
 *
 * Return values:
 *  nothing
 */
void
create_memory_map(struct vm_create_params *vcp)
{
	size_t len, mem_bytes;
	size_t above_1m = 0, above_4g = 0;

	mem_bytes = vcp->vcp_memranges[0].vmr_size;
	vcp->vcp_nmemranges = 0;
	if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE)
		return;

	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
	len = LOWMEM_KB * 1024;
	vcp->vcp_memranges[0].vmr_gpa = 0x0;
	vcp->vcp_memranges[0].vmr_size = len;
	mem_bytes -= len;

	/*
	 * Second memory region: LOWMEM_KB - 1MB.
	 *
	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
	 * We have to add this region, because some systems
	 * unconditionally write to 0xb8000 (VGA RAM), and
	 * we need to make sure that vmm(4) permits accesses
	 * to it. So allocate guest memory for it.
	 */
	len = MB(1) - (LOWMEM_KB * 1024);
	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_size = len;
	mem_bytes -= len;

	/* If we have less than 2MB remaining, still create a 2nd BIOS area. */
	if (mem_bytes <= MB(2)) {
		vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END;
		vcp->vcp_memranges[2].vmr_size = MB(2);
		vcp->vcp_nmemranges = 3;
		return;
	}

	/*
	 * Calculate how to split any remaining memory across the 4GB
	 * boundary while making sure we do not place physical memory into
	 * MMIO ranges.
	 */
	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) {
		above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1);
		above_4g = mem_bytes - above_1m;
	} else {
		above_1m = mem_bytes;
		above_4g = 0;
	}

	/* Third memory region: area above 1MB to MMIO region */
	vcp->vcp_memranges[2].vmr_gpa = MB(1);
	vcp->vcp_memranges[2].vmr_size = above_1m;

	/* Fourth region: 2nd copy of BIOS above MMIO ending at 4GB */
	vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
	vcp->vcp_memranges[3].vmr_size = MB(2);

	/* Fifth region: any remainder above 4GB */
	if (above_4g > 0) {
		vcp->vcp_memranges[4].vmr_gpa = GB(4);
		vcp->vcp_memranges[4].vmr_size = above_4g;
		vcp->vcp_nmemranges = 5;
	} else
		vcp->vcp_nmemranges = 4;
}

/*
 * alloc_guest_mem
 *
 * Allocates memory for the guest.
 * Instead of doing a single allocation with one mmap(), we allocate memory
 * separately for every range for the following reasons:
 * - ASLR for the individual ranges
 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
 *   map the single mmap'd userspace memory to the individual guest physical
 *   memory ranges, the underlying amap of the single mmap'd range would have
 *   to allocate per-page reference counters. The reason is that the
 *   individual guest physical ranges would reference the single mmap'd region
 *   only partially. However, if every guest physical range has its own
 *   corresponding mmap'd userspace allocation, there are no partial
 *   references: every guest physical range fully references an mmap'd
 *   range => no per-page reference counters have to be allocated.
 *
 * Return values:
 *  0: success
 *  !0: failure - errno indicating the source of the failure
 */
int
alloc_guest_mem(struct vm_create_params *vcp)
{
	void *p;
	int ret;
	size_t i, j;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANON, -1, 0);
		if (p == MAP_FAILED) {
			ret = errno;
			for (j = 0; j < i; j++) {
				vmr = &vcp->vcp_memranges[j];
				munmap((void *)vmr->vmr_va, vmr->vmr_size);
			}

			return (ret);
		}

		vmr->vmr_va = (vaddr_t)p;
	}

	return (0);
}

/*
 * vmm_create_vm
 *
 * Requests vmm(4) to create a new VM using the supplied creation
 * parameters. This operation results in the creation of the in-kernel
 * structures for the VM, but does not start the VM's vcpu(s).
 *
 * Parameters:
 *  vcp: vm_create_params struct containing the VM's desired creation
 *      configuration
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed
 */
int
vmm_create_vm(struct vm_create_params *vcp)
{
	/* Sanity check arguments */
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
		return (errno);

	return (0);
}

/*
 * init_emulated_hw
 *
 * Initializes the userspace hardware emulation
 */
void
init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	size_t i;
	uint64_t memlo, memhi;

	/* Calculate memory size for NVRAM registers */
	memlo = memhi = 0;
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		if (vcp->vcp_memranges[i].vmr_gpa == MB(1) &&
		    vcp->vcp_memranges[i].vmr_size > (15 * MB(1)))
			memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1));
		else if (vcp->vcp_memranges[i].vmr_gpa == GB(4))
			memhi = vcp->vcp_memranges[i].vmr_size;
	}

	/* Reset the IO port map */
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_init(vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;

	/* Init mc146818 RTC */
	mc146818_init(vcp->vcp_id, memlo, memhi);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init master and slave PICs */
	i8259_init();
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
	ioports_map[ELCR0] = vcpu_exit_elcr;
	ioports_map[ELCR1] = vcpu_exit_elcr;

	/* Init ns8250 UART */
	ns8250_init(con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Init QEMU fw_cfg interface */
	fw_cfg_init(vmc);
	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_init();

	/* Initialize virtio devices */
	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
}

/*
 * restore_emulated_hw
 *
 * Restores the userspace hardware emulation from fd
 */
void
restore_emulated_hw(struct vm_create_params *vcp, int fd,
    int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
{
	/* struct vm_create_params *vcp = &vmc->vmc_params; */
	int i;
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_restore(fd, vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;

	/* Init master and slave PICs */
	i8259_restore(fd);
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;

	/* Init ns8250 UART */
	ns8250_restore(fd, con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Init mc146818 RTC */
	mc146818_restore(fd, vcp->vcp_id);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init QEMU fw_cfg interface */
	fw_cfg_restore(fd);
	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_restore(fd);
	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
}

/*
 * run_vm
 *
 * Runs the VM whose creation parameters are specified in vcp
 *
 * Parameters:
 *  child_cdrom: previously-opened child ISO disk file descriptor
 *  child_disks: previously-opened child VM disk file descriptors
 *  child_taps: previously-opened child tap file descriptors
 *  vmc: vmop_create_params struct containing the VM's desired creation
 *      configuration
 *  vrs: VCPU register state to initialize
 *
 * Return values:
 *  0: the VM exited normally
 *  !0 : the VM exited abnormally or failed to start
 */
int
run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
    int *child_taps, struct vmop_create_params *vmc,
    struct vcpu_reg_state *vrs)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct vm_rwregs_params vregsp;
	uint8_t evdone = 0;
	size_t i;
	int ret;
	pthread_t *tid, evtid;
	struct vm_run_params **vrp;
	void *exit_status;

	if (vcp == NULL)
		return (EINVAL);

	if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
		return (EINVAL);

	if (child_disks == NULL && vcp->vcp_ndisks != 0)
		return (EINVAL);

	if (child_taps == NULL && vcp->vcp_nnics != 0)
		return (EINVAL);

	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
	if (tid == NULL || vrp == NULL) {
		log_warn("%s: memory allocation error - exiting.",
		    __progname);
		return (ENOMEM);
	}

	log_debug("%s: initializing hardware for vm %s", __func__,
	    vcp->vcp_name);

	if (!(current_vm->vm_state & VM_STATE_RECEIVED))
		init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);

	ret = pthread_mutex_init(&threadmutex, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state mutex",
		    __func__);
		return (ret);
	}
	ret = pthread_cond_init(&threadcond, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state "
		    "condition variable", __func__);
		return (ret);
	}

	mutex_lock(&threadmutex);

	log_debug("%s: starting vcpu threads for vm %s", __func__,
	    vcp->vcp_name);

	/*
	 * Create and launch one thread for each VCPU. These threads may
	 * migrate between PCPUs over time; the need to reload CPU state
	 * in such situations is detected and performed by vmm(4) in the
	 * kernel.
	 */
	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp[i] = malloc(sizeof(struct vm_run_params));
		if (vrp[i] == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
		if (vrp[i]->vrp_exit == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_vm_id = vcp->vcp_id;
		vrp[i]->vrp_vcpu_id = i;

		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
			log_warnx("%s: cannot reset VCPU %zu - exiting.",
			    __progname, i);
			return (EIO);
		}

		/* once more because reset_cpu changes regs */
		if (current_vm->vm_state & VM_STATE_RECEIVED) {
			vregsp.vrwp_vm_id = vcp->vcp_id;
			vregsp.vrwp_vcpu_id = i;
			vregsp.vrwp_regs = *vrs;
			vregsp.vrwp_mask = VM_RWREGS_ALL;
			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
			    &vregsp)) == -1) {
				log_warn("%s: writeregs failed", __func__);
				return (ret);
			}
		}

		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize cond var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize unpause var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize unpause mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		vcpu_hlt[i] = 0;

		/* Start each VCPU run thread at vcpu_run_loop */
		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
		if (ret) {
			/* caller will _exit after this return */
			ret = errno;
			log_warn("%s: could not create vcpu thread %zu",
			    __func__, i);
			return (ret);
		}
	}

	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
	if (ret) {
		errno = ret;
		log_warn("%s: could not create event thread", __func__);
		return (ret);
	}

	for (;;) {
		ret = pthread_cond_wait(&threadcond, &threadmutex);
		if (ret) {
			log_warn("%s: waiting on thread state condition "
			    "variable failed", __func__);
			return (ret);
		}

		/*
		 * Did a VCPU thread exit with an error? => return the first one
		 */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				continue;

			if (pthread_join(tid[i], &exit_status)) {
				log_warn("%s: failed to join thread %zd - "
				    "exiting", __progname, i);
				return (EIO);
			}

			ret = (intptr_t)exit_status;
		}

		/* Did the event thread exit? => return with an error */
		if (evdone) {
			if (pthread_join(evtid, &exit_status)) {
				log_warn("%s: failed to join event thread - "
				    "exiting", __progname);
				return (EIO);
			}

			log_warnx("%s: vm %d event thread exited "
			    "unexpectedly", __progname, vcp->vcp_id);
			return (EIO);
		}

		/* Did all VCPU threads exit successfully? => return */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				break;
		}
		if (i == vcp->vcp_ncpus)
			return (ret);

		/* Some more threads to wait for, start over */
	}

	return (ret);
}

void *
event_thread(void *arg)
{
	uint8_t *donep = arg;
	intptr_t ret;

	ret = event_dispatch();

	mutex_lock(&threadmutex);
	*donep = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return (void *)ret;
}

/*
 * vcpu_run_loop
 *
 * Runs a single VCPU until vmm(4) requires help handling an exit,
 * or the VM terminates.
 *
 * Parameters:
 *  arg: vcpu_run_params for the VCPU being run by this thread
 *
 * Return values:
 *  NULL: the VCPU shutdown properly
 *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
 */
void *
vcpu_run_loop(void *arg)
{
	struct vm_run_params *vrp = (struct vm_run_params *)arg;
	intptr_t ret = 0;
	int irq;
	uint32_t n;

	vrp->vrp_continue = 0;
	n = vrp->vrp_vcpu_id;

	for (;;) {
		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't lock vcpu run mtx (%d)",
			    __func__, (int)ret);
			return ((void *)ret);
		}

		/* If we are halted and need to pause, pause */
		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
			ret = pthread_barrier_wait(&vm_pause_barrier);
			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
				log_warnx("%s: could not wait on pause barrier (%d)",
				    __func__, (int)ret);
				return ((void *)ret);
			}

			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
				    __func__, (int)ret);
				return ((void *)ret);
			}

			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
			    &vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx(
				    "%s: can't wait on unpause cond (%d)",
				    __func__, (int)ret);
				break;
			}
			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx("%s: can't unlock unpause mtx (%d)",
				    __func__, (int)ret);
				break;
			}
		}

		/* If we are halted and not paused, wait */
		if (vcpu_hlt[n]) {
			ret = pthread_cond_wait(&vcpu_run_cond[n],
			    &vcpu_run_mtx[n]);

			if (ret) {
				log_warnx(
				    "%s: can't wait on cond (%d)",
				    __func__, (int)ret);
				(void)pthread_mutex_unlock(
				    &vcpu_run_mtx[n]);
				break;
			}
		}

		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't unlock mutex on cond (%d)",
			    __func__, (int)ret);
			break;
		}

		if (vrp->vrp_irqready && i8259_is_pending()) {
			irq = i8259_ack();
			vrp->vrp_irq = irq;
		} else
			vrp->vrp_irq = 0xFFFF;

		/* Still more pending? */
		if (i8259_is_pending()) {
			/*
			 * XXX can probably avoid ioctls here by providing intr
			 * in vrp
			 */
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 1)) {
				fatal("can't set INTR");
			}
		} else {
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 0)) {
				fatal("can't clear INTR");
			}
		}

		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
			/* If run ioctl failed, exit */
			ret = errno;
			log_warn("%s: vm %d / vcpu %d run ioctl failed",
			    __func__, vrp->vrp_vm_id, n);
			break;
		}

		/* If the VM is terminating, exit normally */
		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
			ret = (intptr_t)NULL;
			break;
		}

		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
			/*
			 * vmm(4) needs help handling an exit, handle in
			 * vcpu_exit.
			 */
			ret = vcpu_exit(vrp);
			if (ret)
				break;
		}
	}

	mutex_lock(&threadmutex);
	vcpu_done[n] = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return ((void *)ret);
}

int
vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
{
	struct vm_intr_params vip;

	memset(&vip, 0, sizeof(vip));

	vip.vip_vm_id = vm_id;
	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
	vip.vip_intr = intr;

	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
		return (errno);

	return (0);
}

/*
 * vcpu_exit_pci
 *
 * Handle all I/O to the emulated PCI subsystem.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return value:
 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
 *      be injected.
 */
uint8_t
vcpu_exit_pci(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr;

	intr = 0xFF;

	switch (vei->vei.vei_port) {
	case PCI_MODE1_ADDRESS_REG:
		pci_handle_address_reg(vrp);
		break;
	case PCI_MODE1_DATA_REG:
	case PCI_MODE1_DATA_REG + 1:
	case PCI_MODE1_DATA_REG + 2:
	case PCI_MODE1_DATA_REG + 3:
		pci_handle_data_reg(vrp);
		break;
	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
		intr = pci_handle_io(vrp);
		break;
	default:
		log_warnx("%s: unknown PCI register 0x%llx",
		    __progname, (uint64_t)vei->vei.vei_port);
		break;
	}

	return (intr);
}

/*
 * vcpu_exit_inout
 *
 * Handle all I/O exits that need to be emulated in vmd. This includes the
 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 */
void
vcpu_exit_inout(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr = 0xFF;

	if (ioports_map[vei->vei.vei_port] != NULL)
		intr = ioports_map[vei->vei.vei_port](vrp);
	else if (vei->vei.vei_dir == VEI_DIR_IN)
		set_return_data(vei, 0xFFFFFFFF);

	if (intr != 0xFF)
		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
}

/*
 * vcpu_exit_eptviolation
 *
 * handle an EPT Violation
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: no action required
 *  EAGAIN: a protection fault occurred, kill the vm.
 */
int
vcpu_exit_eptviolation(struct vm_run_params *vrp)
{
	struct vm_exit *ve = vrp->vrp_exit;

	/*
	 * vmm(4) may be exiting to vmd to handle a pending interrupt
	 * while the last exit type was VMX_EXIT_EPT_VIOLATION, so
	 * check the fault_type to ensure we really are processing
	 * a VMX_EXIT_EPT_VIOLATION.
	 */
	if (ve->vee.vee_fault_type == VEE_FAULT_PROTECT) {
		log_debug("%s: EPT Violation: rip=0x%llx",
		    __progname, vrp->vrp_exit->vrs.vrs_gprs[VCPU_REGS_RIP]);
		return (EAGAIN);
	}

	return (0);
}

/*
 * vcpu_exit
 *
 * Handle a vcpu exit. This function is called when it is determined that
 * vmm(4) requires the assistance of vmd to support a particular guest
 * exit type (eg, accessing an I/O port or device). Guest state is contained
 * in 'vrp', and will be resent to vmm(4) on exit completion.
 *
 * Upon conclusion of handling the exit, the function determines if any
 * interrupts should be injected into the guest, and asserts the proper
 * IRQ line whose interrupt should be vectored.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: the exit was handled successfully
 *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
 */
int
vcpu_exit(struct vm_run_params *vrp)
{
	int ret;

	switch (vrp->vrp_exit_reason) {
	case VMX_EXIT_INT_WINDOW:
	case SVM_VMEXIT_VINTR:
	case VMX_EXIT_CPUID:
	case VMX_EXIT_EXTINT:
	case SVM_VMEXIT_INTR:
	case SVM_VMEXIT_NPF:
	case SVM_VMEXIT_MSR:
	case SVM_VMEXIT_CPUID:
		/*
		 * We may be exiting to vmd to handle a pending interrupt but
		 * at the same time the last exit type may have been one of
		 * these. In this case, there's nothing extra to be done
		 * here (and falling through to the default case below results
		 * in more vmd log spam).
		 */
		break;
	case VMX_EXIT_EPT_VIOLATION:
		ret = vcpu_exit_eptviolation(vrp);
		if (ret)
			return (ret);

		break;
	case VMX_EXIT_IO:
	case SVM_VMEXIT_IOIO:
		vcpu_exit_inout(vrp);
		break;
	case VMX_EXIT_HLT:
	case SVM_VMEXIT_HLT:
		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
		if (ret) {
			log_warnx("%s: can't lock vcpu mutex (%d)",
			    __func__, ret);
			return (ret);
		}
		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
		if (ret) {
			log_warnx("%s: can't unlock vcpu mutex (%d)",
			    __func__, ret);
			return (ret);
		}
		break;
	case VMX_EXIT_TRIPLE_FAULT:
	case SVM_VMEXIT_SHUTDOWN:
		/* reset VM */
		return (EAGAIN);
	default:
		log_debug("%s: unknown exit reason 0x%x",
		    __progname, vrp->vrp_exit_reason);
	}

	vrp->vrp_continue = 1;

	return (0);
}

/*
 * find_gpa_range
 *
 * Search for a contiguous guest physical mem range.
 *
 * Parameters:
 *  vcp: VM create parameters that contain the memory map to search in
 *  gpa: the starting guest physical address
 *  len: the length of the memory range
 *
 * Return values:
 *  NULL: on failure if there is no memory range as described by the
 *      parameters
 *  Pointer to vm_mem_range that contains the start of the range otherwise.
 */
static struct vm_mem_range *
find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
{
	size_t i, n;
	struct vm_mem_range *vmr;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa < vmr->vmr_gpa + vmr->vmr_size)
			break;
	}

	/* No range found. */
	if (i == vcp->vcp_nmemranges)
		return (NULL);

	/*
	 * vmr may cover the range [gpa, gpa + len) only partly. Make
	 * sure that the following vm_mem_ranges are contiguous and
	 * cover the rest.
	 */
	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
	if (len < n)
		len = 0;
	else
		len -= n;
	gpa = vmr->vmr_gpa + vmr->vmr_size;
	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa != vmr->vmr_gpa)
			return (NULL);
		if (len <= vmr->vmr_size)
			len = 0;
		else
			len -= vmr->vmr_size;

		gpa = vmr->vmr_gpa + vmr->vmr_size;
	}

	if (len != 0)
		return (NULL);

	return (vmr);
}

/*
 * write_mem
 *
 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
 *
 * Parameters:
 *  dst: the destination paddr_t in the guest VM
 *  buf: data to copy (or NULL to zero the data)
 *  len: number of bytes to copy
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [dst, dst + len) does not
 *      exist in the guest.
 */
int
write_mem(paddr_t dst, const void *buf, size_t len)
{
	const char *from = buf;
	char *to;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
		    "len = 0x%zx", __func__, dst, len);
		return (EINVAL);
	}

	off = dst - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		to = (char *)vmr->vmr_va + off;
		if (buf == NULL)
			memset(to, 0, n);
		else {
			memcpy(to, from, n);
			from += n;
		}
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * read_mem
 *
 * Reads memory at guest paddr 'src' into 'buf'.
 *
 * Parameters:
 *  src: the source paddr_t in the guest VM to read from.
 *  buf: destination (local) buffer
 *  len: number of bytes to read
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [src, src + len) does not
 *      exist in the guest.
 */
int
read_mem(paddr_t src, void *buf, size_t len)
{
	char *from, *to = buf;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range src = 0x%lx, "
		    "len = 0x%zx", __func__, src, len);
		return (EINVAL);
	}

	off = src - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		from = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		to += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * vcpu_assert_pic_irq
 *
 * Injects the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to inject to
 *  vcpu_id: VCPU ID to inject to
 *  irq: IRQ to inject
 */
void
vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	int ret;

	i8259_assert_irq(irq);

	if (i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
			fatalx("%s: can't assert INTR", __func__);

		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);

		vcpu_hlt[vcpu_id] = 0;
		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
		if (ret)
			fatalx("%s: can't signal (%d)", __func__, ret);
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
	}
}

/*
 * vcpu_deassert_pic_irq
 *
 * Clears the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to clear in
 *  vcpu_id: VCPU ID to clear in
 *  irq: IRQ to clear
 */
void
vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	i8259_deassert_irq(irq);

	if (!i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
			fatalx("%s: can't deassert INTR for vm_id %d, "
			    "vcpu_id %d", __func__, vm_id, vcpu_id);
	}
}

/*
 * fd_hasdata
 *
 * Determines if data can be read from a file descriptor.
 *
 * Parameters:
 *  fd: the fd to check
 *
 * Return values:
 *  1 if data can be read from an fd, or 0 otherwise.
 */
int
fd_hasdata(int fd)
{
	struct pollfd pfd[1];
	int nready, hasdata = 0;

	pfd[0].fd = fd;
	pfd[0].events = POLLIN;
	nready = poll(pfd, 1, 0);
	if (nready == -1)
		log_warn("checking file descriptor for data failed");
	else if (nready == 1 && pfd[0].revents & POLLIN)
		hasdata = 1;
	return (hasdata);
}

/*
 * mutex_lock
 *
 * Wrapper function for pthread_mutex_lock that does error checking and that
 * exits on failure
 */
void
mutex_lock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_lock(m);
	if (ret) {
		errno = ret;
		fatal("could not acquire mutex");
	}
}

/*
 * mutex_unlock
 *
 * Wrapper function for pthread_mutex_unlock that does error checking and that
 * exits on failure
 */
void
mutex_unlock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_unlock(m);
	if (ret) {
		errno = ret;
		fatal("could not release mutex");
	}
}

/*
 * set_return_data
 *
 * Utility function for manipulating register data in vm exit info structs.
 * This function ensures that the data is copied to the vei->vei.vei_data
 * field with the proper size for the operation being performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: return data
 */
void
set_return_data(struct vm_exit *vei, uint32_t data)
{
	switch (vei->vei.vei_size) {
	case 1:
		vei->vei.vei_data &= ~0xFF;
		vei->vei.vei_data |= (uint8_t)data;
		break;
	case 2:
		vei->vei.vei_data &= ~0xFFFF;
		vei->vei.vei_data |= (uint16_t)data;
		break;
	case 4:
		vei->vei.vei_data = data;
		break;
	}
}

/*
 * get_input_data
 *
 * Utility function for manipulating register data in vm exit info
 * structs. This function ensures that the data is copied from the
 * vei->vei.vei_data field with the proper size for the operation being
 * performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: location to store the result
 */
void
get_input_data(struct vm_exit *vei, uint32_t *data)
{
	switch (vei->vei.vei_size) {
	case 1:
		*data &= 0xFFFFFF00;
		*data |= (uint8_t)vei->vei.vei_data;
		break;
	case 2:
		*data &= 0xFFFF0000;
		*data |= (uint16_t)vei->vei.vei_data;
		break;
	case 4:
		*data = vei->vei.vei_data;
		break;
	default:
		log_warnx("%s: invalid i/o size %d", __func__,
		    vei->vei.vei_size);
	}

}

/*
 * translate_gva
 *
 * Translates a guest virtual address to a guest physical address by walking
 * the currently active page table (if needed).
 *
 * Note - this function can possibly alter the supplied VCPU state.
 * Specifically, it may inject exceptions depending on the current VCPU
 * configuration, and may alter %cr2 on #PF. Consequently, this function
 * should only be used as part of instruction emulation.
 *
 * Parameters:
 *  exit: The VCPU this translation should be performed for (guest MMU settings
 *      are gathered from this VCPU)
 *  va: virtual address to translate
 *  pa: pointer to paddr_t variable that will receive the translated physical
 *      address. 'pa' is unchanged on error.

/*
 * translate_gva
 *
 * Translates a guest virtual address to a guest physical address by walking
 * the currently active page table (if needed).
 *
 * Note - this function can possibly alter the supplied VCPU state.
 * Specifically, it may inject exceptions depending on the current VCPU
 * configuration, and may alter %cr2 on #PF. Consequently, this function
 * should only be used as part of instruction emulation.
 *
 * Parameters:
 *  exit: The VCPU this translation should be performed for (guest MMU
 *   settings are gathered from this VCPU)
 *  va: virtual address to translate
 *  pa: pointer to paddr_t variable that will receive the translated physical
 *   address. 'pa' is unchanged on error.
 *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
 *   the address should be translated
 *
 * Return values:
 *  0: the address was successfully translated - 'pa' contains the physical
 *   address currently mapped by 'va'.
 *  EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
 *   and %cr2 set in the vcpu structure.
 *  EINVAL: an error occurred reading paging table structures
 */
int
translate_gva(struct vm_exit *exit, uint64_t va, uint64_t *pa, int mode)
{
	int level, shift, pdidx;
	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
	uint64_t shift_width, pte_size;
	struct vcpu_reg_state *vrs;

	vrs = &exit->vrs;

	if (!pa)
		return (EINVAL);

	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
		*pa = va;
		return (0);
	}

	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];

	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);

	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
			pte_size = sizeof(uint64_t);
			shift_width = 9;

			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
				/* 4 level paging */
				level = 4;
				mask = L4_MASK;
				shift = L4_SHIFT;
			} else {
				/* 32 bit with PAE paging */
				level = 3;
				mask = L3_MASK;
				shift = L3_SHIFT;
			}
		} else {
			/* 32 bit paging */
			level = 2;
			shift_width = 10;
			mask = 0xFFC00000;
			shift = 22;
			pte_size = sizeof(uint32_t);
		}
	} else
		return (EINVAL);

	/* XXX: Check for R bit in segment selector and set A bit */

	for (; level > 0; level--) {
		pdidx = (va & mask) >> shift;
		pte_paddr = (pt_paddr) + (pdidx * pte_size);

		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
		    level, pte_paddr);
		if (read_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to read pte", __func__);
			return (EFAULT);
		}

		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
		    pte);

		/* XXX: Set CR2 */
		if (!(pte & PG_V))
			return (EFAULT);

		/* XXX: Check for SMAP */
		if ((mode == PROT_WRITE) && !(pte & PG_RW))
			return (EPERM);

		if ((exit->cpl > 0) && !(pte & PG_u))
			return (EPERM);

		pte = pte | PG_U;
		if (mode == PROT_WRITE)
			pte = pte | PG_M;
		if (write_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to write back flags to pte",
			    __func__);
			return (EIO);
		}

		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
		if (pte & PG_PS)
			break;

		if (level > 1) {
			pt_paddr = pte & PG_FRAME;
			shift -= shift_width;
			mask = mask >> shift_width;
		}
	}

	low_mask = (1 << shift) - 1;
	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
	*pa = (pte & high_mask) | (va & low_mask);

	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);

	return (0);
}
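
/*
 * Illustrative sketch (not part of the original source): reading guest
 * memory through a guest *virtual* address by combining translate_gva()
 * and read_mem() above.  For brevity the sketch assumes the requested
 * range does not cross a page boundary; a real caller would have to split
 * the access at page granularity.  The function name is hypothetical.
 */
static int
example_read_gva(struct vm_exit *exit, uint64_t gva, void *buf, size_t len)
{
	uint64_t gpa;
	int ret;

	/* Walk the guest page tables for a read access. */
	ret = translate_gva(exit, gva, &gpa, PROT_READ);
	if (ret != 0)
		return (ret);

	/* Copy from the translated guest physical address. */
	return (read_mem(gpa, buf, len));
}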

/*
 * vm_pipe_init
 *
 * Initialize a vm_dev_pipe, setting up its file descriptors and its
 * event structure with the given callback.
 *
 * Parameters:
 *  p: pointer to vm_dev_pipe struct to initialize
 *  cb: callback to use for READ events on the read end of the pipe
 */
void
vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
{
	int ret;
	int fds[2];

	memset(p, 0, sizeof(struct vm_dev_pipe));

	ret = pipe(fds);
	if (ret)
		fatal("failed to create vm_dev_pipe pipe");

	p->read = fds[0];
	p->write = fds[1];

	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
}

/*
 * vm_pipe_send
 *
 * Send a message to an emulated device via the provided vm_dev_pipe.
 *
 * Parameters:
 *  p: pointer to initialized vm_dev_pipe
 *  msg: message to send in the channel
 */
void
vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
{
	size_t n;

	n = write(p->write, &msg, sizeof(msg));
	if (n != sizeof(msg))
		fatal("failed to write to device pipe");
}

/*
 * vm_pipe_recv
 *
 * Receive a message for an emulated device via the provided vm_dev_pipe.
 * Returns the message value, otherwise will exit on failure.
 *
 * Parameters:
 *  p: pointer to initialized vm_dev_pipe
 *
 * Return values:
 *  a value of enum pipe_msg_type or fatal exit on read(2) error
 */
enum pipe_msg_type
vm_pipe_recv(struct vm_dev_pipe *p)
{
	size_t n;
	enum pipe_msg_type msg;

	n = read(p->read, &msg, sizeof(msg));
	if (n != sizeof(msg))
		fatal("failed to read from device pipe");

	return msg;
}
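
/*
 * Illustrative sketch (not part of the original source): the intended usage
 * pattern for the vm_dev_pipe helpers above.  A device registers a read
 * callback on the event loop, then other threads hand it messages with
 * vm_pipe_send().  The pipe, callback and function names are hypothetical,
 * and the message value is taken as a parameter so that no specific
 * enum pipe_msg_type member is assumed.
 */
static struct vm_dev_pipe example_pipe;	/* hypothetical device pipe */

static void
example_pipe_dispatch(int fd, short event, void *arg)
{
	enum pipe_msg_type msg;

	/* Drain one message posted by vm_pipe_send() and act on it. */
	msg = vm_pipe_recv(&example_pipe);
	log_debug("%s: received pipe message %d", __func__, (int)msg);
}

static void
example_pipe_setup(void)
{
	/* Create the pipe and bind its read end to the event loop. */
	vm_pipe_init(&example_pipe, example_pipe_dispatch);
	event_add(&example_pipe.read_ev, NULL);
}

static void
example_pipe_post(enum pipe_msg_type msg)
{
	/* Called from another thread to wake the device's event handler. */
	vm_pipe_send(&example_pipe, msg);
}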