/*	$OpenBSD: vm.c,v 1.40 2018/09/28 12:35:32 reyk Exp $	*/

/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/wait.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/mman.h>

#include <dev/ic/i8253reg.h>
#include <dev/isa/isareg.h>
#include <dev/pci/pcireg.h>

#include <machine/param.h>
#include <machine/psl.h>
#include <machine/pte.h>
#include <machine/specialreg.h>
#include <machine/vmmvar.h>

#include <net/if.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <imsg.h>
#include <limits.h>
#include <poll.h>
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <util.h>

#include "vmd.h"
#include "vmm.h"
#include "loadfile.h"
#include "pci.h"
#include "virtio.h"
#include "proc.h"
#include "i8253.h"
#include "i8259.h"
#include "ns8250.h"
#include "mc146818.h"
#include "atomicio.h"

io_fn_t ioports_map[MAX_PORTS];

int run_vm(int, int *, int *, struct vmop_create_params *,
    struct vcpu_reg_state *);
void vm_dispatch_vmm(int, short, void *);
void *event_thread(void *);
void *vcpu_run_loop(void *);
int vcpu_exit(struct vm_run_params *);
int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
void create_memory_map(struct vm_create_params *);
int alloc_guest_mem(struct vm_create_params *);
int vmm_create_vm(struct vm_create_params *);
void init_emulated_hw(struct vmop_create_params *, int, int *, int *);
void restore_emulated_hw(struct vm_create_params *, int, int *, int *, int);
void vcpu_exit_inout(struct vm_run_params *);
uint8_t vcpu_exit_pci(struct vm_run_params *);
int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
int loadfile_bios(FILE *, struct vcpu_reg_state *);
int send_vm(int, struct vm_create_params *);
int dump_send_header(int);
int dump_vmr(int, struct vm_mem_range *);
int dump_mem(int, struct vm_create_params *);
void restore_vmr(int, struct vm_mem_range *);
void restore_mem(int, struct vm_create_params *);
void pause_vm(struct vm_create_params *);
void unpause_vm(struct vm_create_params *);

static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
    size_t);

int con_fd;
struct vmd_vm *current_vm;

extern struct vmd *env;

extern char *__progname;

pthread_mutex_t threadmutex;
pthread_cond_t threadcond;

pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
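
/*
 * Per-VCPU state flags: vcpu_hlt is set while a VCPU has executed HLT and
 * is waiting for an interrupt; vcpu_done is set once that VCPU's run
 * thread has exited.
 */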
uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];

/*
 * Represents a standard register set for an OS to be booted
 * as a flat 64 bit address space.
 *
 * NOT set here are:
 *  RIP
 *  RSP
 *  GDTR BASE
 *
 * Specific bootloaders should clone this structure and override
 * those fields as needed.
 *
 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
 *        features of the CPU in use.
 */
static const struct vcpu_reg_state vcpu_init_flat64 = {
#ifdef __i386__
	.vrs_gprs[VCPU_REGS_EFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_EIP] = 0x0,
	.vrs_gprs[VCPU_REGS_ESP] = 0x0,
#else
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
#endif
	.vrs_crs[VCPU_REGS_CR0] = CR0_CD | CR0_NW | CR0_ET | CR0_PE | CR0_PG,
	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
#ifndef __i386__
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
#endif
};

/*
 * Represents a standard register set for a BIOS to be booted
 * as a flat 16 bit address space.
 */
static const struct vcpu_reg_state vcpu_init_flat16 = {
#ifdef __i386__
	.vrs_gprs[VCPU_REGS_EFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_EIP] = 0xFFF0,
	.vrs_gprs[VCPU_REGS_ESP] = 0x0,
#else
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
#endif
	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
	.vrs_crs[VCPU_REGS_CR3] = 0,
	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
#ifndef __i386__
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
#endif
};

/*
 * loadfile_bios
 *
 * As an alternative to loadfile_elf, this function loads a non-ELF BIOS
 * image directly into memory.
 *
 * Parameters:
 *  fp: file of the BIOS image to load
 *  (out) vrs: register state to set on init for this kernel
 *
 * Return values:
 *  0 if successful
 *  various error codes returned from read(2) or loadelf functions
 */
int
loadfile_bios(FILE *fp, struct vcpu_reg_state *vrs)
{
	off_t size, off;

	/* Set up a "flat 16 bit" register state for BIOS */
	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));

	/* Get the size of the BIOS image and seek to the beginning */
	if (fseeko(fp, 0, SEEK_END) == -1 || (size = ftello(fp)) == -1 ||
	    fseeko(fp, 0, SEEK_SET) == -1)
		return (-1);

	/* The BIOS image must end at 1M */
	if ((off = 1048576 - size) < 0)
		return (-1);

	/* Read BIOS image into memory */
	if (mread(fp, off, size) != (size_t)size) {
		errno = EIO;
		return (-1);
	}

	log_debug("%s: loaded BIOS image", __func__);

	return (0);
}

/*
 * start_vm
 *
 * After forking a new VM process, starts the new VM with the creation
 * parameters supplied (in the incoming vm->vm_params field). This
 * function performs a basic sanity check on the incoming parameters
 * and then performs the following steps to complete the creation of the VM:
 *
 * 1. validates and creates the new VM
 * 2. opens the imsg control channel to the parent and drops more privilege
 * 3. drops additional privileges by calling pledge(2)
 * 4. loads the kernel from the disk image or file descriptor
 * 5. runs the VM's VCPU loops.
 *
 * Parameters:
 *  vm: The VM data structure that includes the VM create parameters.
 *  fd: The imsg socket that is connected to the parent process.
 *
 * Return values:
 *  0: success
 *  !0 : failure - typically an errno indicating the source of the failure
 */
int
start_vm(struct vmd_vm *vm, int fd)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct vcpu_reg_state vrs;
	int nicfds[VMM_MAX_NICS_PER_VM];
	int ret;
	FILE *fp;
	struct vmboot_params vmboot;
	size_t i;
	struct vm_rwregs_params vrp;

	/* Child */
	setproctitle("%s", vcp->vcp_name);
	log_procinit(vcp->vcp_name);

	if (!vm->vm_received)
		create_memory_map(vcp);

	ret = alloc_guest_mem(vcp);

	if (ret) {
		errno = ret;
		fatal("could not allocate guest memory - exiting");
	}

	ret = vmm_create_vm(vcp);
	current_vm = vm;

	/* send back the kernel-generated vm id (0 on error) */
	if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
	    sizeof(vcp->vcp_id))
		fatal("write vcp id");

	if (ret) {
		errno = ret;
		fatal("create vmm ioctl failed - exiting");
	}

	/*
	 * pledge in the vm processes:
	 * stdio - for malloc and basic I/O including events.
	 * recvfd - for send/recv.
	 * vmm - for the vmm ioctls and operations.
	 */
	if (pledge("stdio vmm recvfd", NULL) == -1)
		fatal("pledge");

	if (vm->vm_received) {
		ret = read(vm->vm_receive_fd, &vrp, sizeof(vrp));
		if (ret != sizeof(vrp)) {
			fatal("received incomplete vrp - exiting");
		}
		vrs = vrp.vrwp_regs;
	} else {
		/*
		 * Set up default "flat 64 bit" register state - RIP,
		 * RSP, and GDT info will be set in bootloader
		 */
		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));

		/* Find and open kernel image */
		if ((fp = vmboot_open(vm->vm_kernel,
		    vm->vm_disks[0], vmc->vmc_disktypes[0], &vmboot)) == NULL)
			fatalx("failed to open kernel - exiting");

		/* Load kernel image */
		ret = loadfile_elf(fp, vcp, &vrs,
		    vmboot.vbp_bootdev, vmboot.vbp_howto);

		/*
		 * Try BIOS as a fallback (only if it was provided as an image
		 * with vm->vm_kernel and not loaded from the disk)
		 */
		if (ret && errno == ENOEXEC && vm->vm_kernel != -1)
			ret = loadfile_bios(fp, &vrs);

		if (ret)
			fatal("failed to load kernel or BIOS - exiting");

		vmboot_close(fp, &vmboot);
	}

	if (vm->vm_kernel != -1)
		close(vm->vm_kernel);

	con_fd = vm->vm_tty;
	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
		fatal("failed to set nonblocking mode on console");

	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
		nicfds[i] = vm->vm_ifs[i].vif_fd;

	event_init();

	if (vm->vm_received) {
		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
		    vm->vm_disks, vm->vm_cdrom);
		mc146818_start();
		restore_mem(vm->vm_receive_fd, vcp);
	}

	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
		fatal("setup vm pipe");

	/* Execute the vcpu run loop(s) for this VM */
	ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);

	/* Ensure that any in-flight data is written back */
	virtio_shutdown(vm);

	return (ret);
}

/*
 * vm_dispatch_vmm
 *
 * imsg callback for messages that are received from the vmm parent process.
 */
void
vm_dispatch_vmm(int fd, short event, void *arg)
{
	struct vmd_vm *vm = arg;
	struct vmop_result vmr;
	struct imsgev *iev = &vm->vm_iev;
	struct imsgbuf *ibuf = &iev->ibuf;
	struct imsg imsg;
	ssize_t n;
	int verbose;

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0)
			_exit(0);
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
		if (n == 0)
			_exit(0);
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

#if DEBUG > 1
		log_debug("%s: got imsg %d from %s",
		    __func__, imsg.hdr.type,
		    vm->vm_params.vmc_params.vcp_name);
#endif

		switch (imsg.hdr.type) {
		case IMSG_CTL_VERBOSE:
			IMSG_SIZE_CHECK(&imsg, &verbose);
			memcpy(&verbose, imsg.data, sizeof(verbose));
			log_setverbose(verbose);
			break;
		case IMSG_VMDOP_VM_SHUTDOWN:
			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_VM_REBOOT:
			if (vmmci_ctl(VMMCI_REBOOT) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_PAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			pause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_UNPAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			unpause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_SEND_VM_REQUEST:
			vmr.vmr_id = vm->vm_vmid;
			vmr.vmr_result = send_vm(imsg.fd,
			    &vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_SEND_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		default:
			fatalx("%s: got invalid imsg %d from %s",
			    __func__, imsg.hdr.type,
			    vm->vm_params.vmc_params.vcp_name);
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}

/*
 * vm_shutdown
 *
 * Tell the vmm parent process to shutdown or reboot the VM and exit.
 */
__dead void
vm_shutdown(unsigned int cmd)
{
	switch (cmd) {
	case VMMCI_NONE:
	case VMMCI_SHUTDOWN:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
		break;
	case VMMCI_REBOOT:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
		break;
	default:
		fatalx("invalid vm ctl command: %d", cmd);
	}
	imsg_flush(&current_vm->vm_iev.ibuf);

	_exit(0);
}

int
send_vm(int fd, struct vm_create_params *vcp)
{
	struct vm_rwregs_params vrp;
	struct vmop_create_params *vmc;
	struct vm_terminate_params vtp;
	unsigned int flags = 0;
	unsigned int i;
	int ret = 0;
	size_t sz;

	if (dump_send_header(fd)) {
		log_info("%s: failed to send vm dump header", __func__);
		goto err;
	}

	pause_vm(vcp);

	vmc = calloc(1, sizeof(struct vmop_create_params));
	if (vmc == NULL) {
		log_warn("%s: calloc error getting vmc", __func__);
		ret = -1;
		goto err;
	}

	flags |= VMOP_CREATE_MEMORY;
	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
	    vmop_create_params));
	vmc->vmc_flags = flags;
	vrp.vrwp_vm_id = vcp->vcp_id;
	vrp.vrwp_mask = VM_RWREGS_ALL;

	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
	if (sz != sizeof(struct vmop_create_params)) {
		ret = -1;
		goto err;
	}

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp.vrwp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
			log_warn("%s: readregs failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vrp,
		    sizeof(struct vm_rwregs_params));
		if (sz != sizeof(struct vm_rwregs_params)) {
			log_warn("%s: dumping registers failed", __func__);
			ret = -1;
			goto err;
		}
	}

	if ((ret = i8253_dump(fd)))
		goto err;
	if ((ret = i8259_dump(fd)))
		goto err;
	if ((ret = ns8250_dump(fd)))
		goto err;
	if ((ret = mc146818_dump(fd)))
		goto err;
	if ((ret = pci_dump(fd)))
		goto err;
	if ((ret = virtio_dump(fd)))
		goto err;
	if ((ret = dump_mem(fd, vcp)))
		goto err;

	vtp.vtp_vm_id = vcp->vcp_id;
	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) < 0) {
		log_warnx("%s: term IOC error: %d, %d", __func__,
		    errno, ENOENT);
	}
err:
	close(fd);
	if (ret)
		unpause_vm(vcp);
	return ret;
}

int
dump_send_header(int fd) {
	struct vm_dump_header vmh;
	int i;

	vmh.vmh_cpuids[0].code = 0x00;
	vmh.vmh_cpuids[0].leaf = 0x00;

	vmh.vmh_cpuids[1].code = 0x01;
	vmh.vmh_cpuids[1].leaf = 0x00;

	vmh.vmh_cpuids[2].code = 0x07;
	vmh.vmh_cpuids[2].leaf = 0x00;

	vmh.vmh_cpuids[3].code = 0x0d;
	vmh.vmh_cpuids[3].leaf = 0x00;

	vmh.vmh_cpuids[4].code = 0x80000001;
	vmh.vmh_cpuids[4].leaf = 0x00;

	vmh.vmh_version = VM_DUMP_VERSION;

	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
		CPUID_LEAF(vmh.vmh_cpuids[i].code,
		    vmh.vmh_cpuids[i].leaf,
		    vmh.vmh_cpuids[i].a,
		    vmh.vmh_cpuids[i].b,
		    vmh.vmh_cpuids[i].c,
		    vmh.vmh_cpuids[i].d);
	}

	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
		return (-1);

	return (0);
}

int
dump_mem(int fd, struct vm_create_params *vcp)
{
	unsigned int i;
	int ret;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		ret = dump_vmr(fd, vmr);
		if (ret)
			return ret;
	}
	return (0);
}
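
/*
 * restore_mem
 *
 * Restores the guest's memory ranges from the given file descriptor,
 * reading each range back page by page (the inverse of dump_mem).
 */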
void
restore_mem(int fd, struct vm_create_params *vcp)
{
	unsigned int i;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		restore_vmr(fd, vmr);
	}
}

int
dump_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t rem = vmr->vmr_size, read = 0;
	char buf[PAGE_SIZE];

	while (rem > 0) {
		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
			log_warn("failed to read vmr");
			return (-1);
		}
		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
			log_warn("failed to dump vmr");
			return (-1);
		}
		rem = rem - PAGE_SIZE;
		read = read + PAGE_SIZE;
	}
	return (0);
}

void
restore_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t rem = vmr->vmr_size, wrote = 0;
	char buf[PAGE_SIZE];

	while (rem > 0) {
		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
			fatal("failed to restore vmr");
		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
			fatal("failed to write vmr");
		rem = rem - PAGE_SIZE;
		wrote = wrote + PAGE_SIZE;
	}
}

void
pause_vm(struct vm_create_params *vcp)
{
	if (current_vm->vm_paused)
		return;

	current_vm->vm_paused = 1;

	/*
	 * XXX: vcpu_run_loop is running in another thread and we have to
	 * wait for the vm to exit before returning
	 */
	sleep(1);

	i8253_stop();
	mc146818_stop();
}

void
unpause_vm(struct vm_create_params *vcp)
{
	unsigned int n;

	if (!current_vm->vm_paused)
		return;

	current_vm->vm_paused = 0;

	i8253_start();
	mc146818_start();
	for (n = 0; n < vcp->vcp_ncpus; n++)
		pthread_cond_broadcast(&vcpu_run_cond[n]);
}

/*
 * vcpu_reset
 *
 * Requests vmm(4) to reset the VCPUs in the indicated VM to
 * the register state provided
 *
 * Parameters
 *  vmid: VM ID to reset
 *  vcpu_id: VCPU ID to reset
 *  vrs: the register state to initialize
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
 *      valid)
 */
int
vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
{
	struct vm_resetcpu_params vrp;

	memset(&vrp, 0, sizeof(vrp));
	vrp.vrp_vm_id = vmid;
	vrp.vrp_vcpu_id = vcpu_id;
	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));

	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);

	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) < 0)
		return (errno);

	return (0);
}

/*
 * create_memory_map
 *
 * Sets up the guest physical memory ranges that the VM can access.
 *
 * Parameters:
 *  vcp: VM create parameters describing the VM whose memory map
 *       is being created
 *
 * Return values:
 *  nothing
 */
void
create_memory_map(struct vm_create_params *vcp)
{
	size_t len, mem_bytes, mem_mb;

	mem_mb = vcp->vcp_memranges[0].vmr_size;
	vcp->vcp_nmemranges = 0;
	if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
		return;

	mem_bytes = mem_mb * 1024 * 1024;

	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
	len = LOWMEM_KB * 1024;
	vcp->vcp_memranges[0].vmr_gpa = 0x0;
	vcp->vcp_memranges[0].vmr_size = len;
	mem_bytes -= len;

	/*
	 * Second memory region: LOWMEM_KB - 1MB.
	 *
	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
	 *        We have to add this region, because some systems
	 *        unconditionally write to 0xb8000 (VGA RAM), and
	 *        we need to make sure that vmm(4) permits accesses
	 *        to it. So allocate guest memory for it.
	 */
	len = 0x100000 - LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_size = len;
	mem_bytes -= len;

	/* Make sure that we do not place physical memory into MMIO ranges. */
	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
		len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
	else
		len = mem_bytes;

	/* Third memory region: 1MB - (1MB + len) */
	vcp->vcp_memranges[2].vmr_gpa = 0x100000;
	vcp->vcp_memranges[2].vmr_size = len;
	mem_bytes -= len;

	if (mem_bytes > 0) {
		/* Fourth memory region for the remaining memory (if any) */
		vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
		vcp->vcp_memranges[3].vmr_size = mem_bytes;
		vcp->vcp_nmemranges = 4;
	} else
		vcp->vcp_nmemranges = 3;
}

/*
 * alloc_guest_mem
 *
 * Allocates memory for the guest.
 * Instead of doing a single allocation with one mmap(), we allocate memory
 * separately for every range for the following reasons:
 * - ASLR for the individual ranges
 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
 *   map the single mmap'd userspace memory to the individual guest physical
 *   memory ranges, the underlying amap of the single mmap'd range would have
 *   to allocate per-page reference counters. The reason is that the
 *   individual guest physical ranges would reference the single mmap'd region
 *   only partially. However, if every guest physical range has its own
 *   corresponding mmap'd userspace allocation, there are no partial
 *   references: every guest physical range fully references an mmap'd
 *   range => no per-page reference counters have to be allocated.
 *
 * Return values:
 *  0: success
 *  !0: failure - errno indicating the source of the failure
 */
int
alloc_guest_mem(struct vm_create_params *vcp)
{
	void *p;
	int ret;
	size_t i, j;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANON, -1, 0);
		if (p == MAP_FAILED) {
			ret = errno;
			for (j = 0; j < i; j++) {
				vmr = &vcp->vcp_memranges[j];
				munmap((void *)vmr->vmr_va, vmr->vmr_size);
			}

			return (ret);
		}

		vmr->vmr_va = (vaddr_t)p;
	}

	return (0);
}

/*
 * vmm_create_vm
 *
 * Requests vmm(4) to create a new VM using the supplied creation
 * parameters. This operation results in the creation of the in-kernel
 * structures for the VM, but does not start the VM's vcpu(s).
 *
 * Parameters:
 *  vcp: vm_create_params struct containing the VM's desired creation
 *      configuration
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed
 */
int
vmm_create_vm(struct vm_create_params *vcp)
{
	/* Sanity check arguments */
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) < 0)
		return (errno);

	return (0);
}

/*
 * init_emulated_hw
 *
 * Initializes the userspace hardware emulation
 */
void
init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
    int *child_disks, int *child_taps)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	int i;
	uint64_t memlo, memhi;

	/* Calculate memory size for NVRAM registers */
	memlo = memhi = 0;
	if (vcp->vcp_nmemranges > 2)
		memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000;

	if (vcp->vcp_nmemranges > 3)
		memhi = vcp->vcp_memranges[3].vmr_size;

	/* Reset the IO port map */
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_init(vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;

	/* Init mc146818 RTC */
	mc146818_init(vcp->vcp_id, memlo, memhi);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init master and slave PICs */
	i8259_init();
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
	ioports_map[ELCR0] = vcpu_exit_elcr;
	ioports_map[ELCR1] = vcpu_exit_elcr;

	/* Init ns8250 UART */
	ns8250_init(con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_init();

	/* Initialize virtio devices */
	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
}

/*
 * restore_emulated_hw
 *
 * Restores the userspace hardware emulation from fd
 */
void
restore_emulated_hw(struct vm_create_params *vcp, int fd,
    int *child_taps, int *child_disks, int child_cdrom)
{
	/* struct vm_create_params *vcp = &vmc->vmc_params; */
	int i;

	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_restore(fd, vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;

	/* Init master and slave PICs */
	i8259_restore(fd);
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;

	/* Init ns8250 UART */
	ns8250_restore(fd, con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Init mc146818 RTC */
	mc146818_restore(fd, vcp->vcp_id);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_restore(fd);
	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
}

/*
 * run_vm
 *
 * Runs the VM whose creation parameters are specified in vcp
 *
 * Parameters:
 *  child_cdrom: previously-opened child ISO disk file descriptor
 *  child_disks: previously-opened child VM disk file descriptors
 *  child_taps: previously-opened child tap file descriptors
 *  vmc: vmop_create_params struct containing the VM's desired creation
 *      configuration
 *  vrs: VCPU register state to initialize
 *
 * Return values:
 *  0: the VM exited normally
 *  !0 : the VM exited abnormally or failed to start
 */
int
run_vm(int child_cdrom, int *child_disks, int *child_taps,
    struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct vm_rwregs_params vregsp;
	uint8_t evdone = 0;
	size_t i;
	int ret;
	pthread_t *tid, evtid;
	struct vm_run_params **vrp;
	void *exit_status;

	if (vcp == NULL)
		return (EINVAL);

	if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
		return (EINVAL);

	if (child_disks == NULL && vcp->vcp_ndisks != 0)
		return (EINVAL);

	if (child_taps == NULL && vcp->vcp_nnics != 0)
		return (EINVAL);

	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
	if (tid == NULL || vrp == NULL) {
		log_warn("%s: memory allocation error - exiting.",
		    __progname);
		return (ENOMEM);
	}

	log_debug("%s: initializing hardware for vm %s", __func__,
	    vcp->vcp_name);

	if (!current_vm->vm_received)
		init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);

	ret = pthread_mutex_init(&threadmutex, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state mutex",
		    __func__);
		return (ret);
	}
	ret = pthread_cond_init(&threadcond, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state "
		    "condition variable", __func__);
		return (ret);
	}

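	/*
	 * Hold threadmutex before creating the VCPU and event threads so
	 * that their vcpu_done/evdone wakeups cannot be missed before the
	 * pthread_cond_wait() loop below is entered.
	 */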
	mutex_lock(&threadmutex);

	log_debug("%s: starting vcpu threads for vm %s", __func__,
	    vcp->vcp_name);

	/*
	 * Create and launch one thread for each VCPU. These threads may
	 * migrate between PCPUs over time; the need to reload CPU state
	 * in such situations is detected and performed by vmm(4) in the
	 * kernel.
	 */
	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp[i] = malloc(sizeof(struct vm_run_params));
		if (vrp[i] == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
		if (vrp[i]->vrp_exit == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_vm_id = vcp->vcp_id;
		vrp[i]->vrp_vcpu_id = i;

		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
			log_warnx("%s: cannot reset VCPU %zu - exiting.",
			    __progname, i);
			return (EIO);
		}

		/* once more because reset_cpu changes regs */
		if (current_vm->vm_received) {
			vregsp.vrwp_vm_id = vcp->vcp_id;
			vregsp.vrwp_vcpu_id = i;
			vregsp.vrwp_regs = *vrs;
			vregsp.vrwp_mask = VM_RWREGS_ALL;
			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
			    &vregsp)) < 0) {
				log_warn("%s: writeregs failed", __func__);
				return (ret);
			}
		}

		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize cond var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		vcpu_hlt[i] = 0;

		/* Start each VCPU run thread at vcpu_run_loop */
		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
		if (ret) {
			/* caller will _exit after this return */
			ret = errno;
			log_warn("%s: could not create vcpu thread %zu",
			    __func__, i);
			return (ret);
		}
	}

	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
	if (ret) {
		errno = ret;
		log_warn("%s: could not create event thread", __func__);
		return (ret);
	}

	for (;;) {
		ret = pthread_cond_wait(&threadcond, &threadmutex);
		if (ret) {
			log_warn("%s: waiting on thread state condition "
			    "variable failed", __func__);
			return (ret);
		}

		/*
		 * Did a VCPU thread exit with an error? => return the first
		 * one
		 */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				continue;

			if (pthread_join(tid[i], &exit_status)) {
				log_warn("%s: failed to join thread %zd - "
				    "exiting", __progname, i);
				return (EIO);
			}

			ret = (intptr_t)exit_status;
		}

		/* Did the event thread exit? => return with an error */
		if (evdone) {
			if (pthread_join(evtid, &exit_status)) {
				log_warn("%s: failed to join event thread - "
				    "exiting", __progname);
				return (EIO);
			}

			log_warnx("%s: vm %d event thread exited "
			    "unexpectedly", __progname, vcp->vcp_id);
			return (EIO);
		}

		/* Did all VCPU threads exit successfully? => return */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				break;
		}
		if (i == vcp->vcp_ncpus)
			return (ret);

		/* Some more threads to wait for, start over */
	}

	return (ret);
}

void *
event_thread(void *arg)
{
	uint8_t *donep = arg;
	intptr_t ret;

	ret = event_dispatch();

	mutex_lock(&threadmutex);
	*donep = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return (void *)ret;
}

/*
 * vcpu_run_loop
 *
 * Runs a single VCPU until vmm(4) requires help handling an exit,
 * or the VM terminates.
 *
 * Parameters:
 *  arg: vcpu_run_params for the VCPU being run by this thread
 *
 * Return values:
 *  NULL: the VCPU shutdown properly
 *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
 */
void *
vcpu_run_loop(void *arg)
{
	struct vm_run_params *vrp = (struct vm_run_params *)arg;
	intptr_t ret = 0;
	int irq;
	uint32_t n;

	vrp->vrp_continue = 0;
	n = vrp->vrp_vcpu_id;

	for (;;) {
		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't lock vcpu run mtx (%d)",
			    __func__, (int)ret);
			return ((void *)ret);
		}

		/* If we are halted or paused, wait */
		if (vcpu_hlt[n]) {
			while (current_vm->vm_paused == 1) {
				ret = pthread_cond_wait(&vcpu_run_cond[n],
				    &vcpu_run_mtx[n]);
				if (ret) {
					log_warnx(
					    "%s: can't wait on cond (%d)",
					    __func__, (int)ret);
					(void)pthread_mutex_unlock(
					    &vcpu_run_mtx[n]);
					break;
				}
			}
			if (vcpu_hlt[n]) {
				ret = pthread_cond_wait(&vcpu_run_cond[n],
				    &vcpu_run_mtx[n]);

				if (ret) {
					log_warnx(
					    "%s: can't wait on cond (%d)",
					    __func__, (int)ret);
					(void)pthread_mutex_unlock(
					    &vcpu_run_mtx[n]);
					break;
				}
			}
		}

		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't unlock mutex on cond (%d)",
			    __func__, (int)ret);
			break;
		}

		if (vrp->vrp_irqready && i8259_is_pending()) {
			irq = i8259_ack();
			vrp->vrp_irq = irq;
		} else
			vrp->vrp_irq = 0xFFFF;

		/* Still more pending? */
		if (i8259_is_pending()) {
			/* XXX can probably avoid ioctls here by providing intr in vrp */
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 1)) {
				fatal("can't set INTR");
			}
		} else {
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 0)) {
				fatal("can't clear INTR");
			}
		}

		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) < 0) {
			/* If run ioctl failed, exit */
			ret = errno;
			log_warn("%s: vm %d / vcpu %d run ioctl failed",
			    __func__, vrp->vrp_vm_id, n);
			break;
		}

		/* If the VM is terminating, exit normally */
		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
			ret = (intptr_t)NULL;
			break;
		}

		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
			/*
			 * vmm(4) needs help handling an exit, handle in
			 * vcpu_exit.
			 */
			ret = vcpu_exit(vrp);
			if (ret)
				break;
		}
	}

	mutex_lock(&threadmutex);
	vcpu_done[n] = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return ((void *)ret);
}

int
vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
{
	struct vm_intr_params vip;

	memset(&vip, 0, sizeof(vip));

	vip.vip_vm_id = vm_id;
	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
	vip.vip_intr = intr;

	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) < 0)
		return (errno);

	return (0);
}

/*
 * vcpu_exit_pci
 *
 * Handle all I/O to the emulated PCI subsystem.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return value:
 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
 *      be injected.
 */
uint8_t
vcpu_exit_pci(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr;

	intr = 0xFF;

	switch (vei->vei.vei_port) {
	case PCI_MODE1_ADDRESS_REG:
		pci_handle_address_reg(vrp);
		break;
	case PCI_MODE1_DATA_REG:
	case PCI_MODE1_DATA_REG + 1:
	case PCI_MODE1_DATA_REG + 2:
	case PCI_MODE1_DATA_REG + 3:
		pci_handle_data_reg(vrp);
		break;
	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
		intr = pci_handle_io(vrp);
		break;
	default:
		log_warnx("%s: unknown PCI register 0x%llx",
		    __progname, (uint64_t)vei->vei.vei_port);
		break;
	}

	return (intr);
}

/*
 * vcpu_exit_inout
 *
 * Handle all I/O exits that need to be emulated in vmd. This includes the
 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 */
void
vcpu_exit_inout(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr = 0xFF;

	if (ioports_map[vei->vei.vei_port] != NULL)
		intr = ioports_map[vei->vei.vei_port](vrp);
	else if (vei->vei.vei_dir == VEI_DIR_IN)
		set_return_data(vei, 0xFFFFFFFF);

	if (intr != 0xFF)
		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
}

/*
 * vcpu_exit
 *
 * Handle a vcpu exit. This function is called when it is determined that
 * vmm(4) requires the assistance of vmd to support a particular guest
 * exit type (eg, accessing an I/O port or device). Guest state is contained
 * in 'vrp', and will be resent to vmm(4) on exit completion.
 *
 * Upon conclusion of handling the exit, the function determines if any
 * interrupts should be injected into the guest, and asserts the proper
 * IRQ line whose interrupt should be vectored.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: the exit was handled successfully
 *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
 */
int
vcpu_exit(struct vm_run_params *vrp)
{
	int ret;

	switch (vrp->vrp_exit_reason) {
	case VMX_EXIT_INT_WINDOW:
	case SVM_VMEXIT_VINTR:
	case VMX_EXIT_CPUID:
	case VMX_EXIT_EXTINT:
	case SVM_VMEXIT_INTR:
	case VMX_EXIT_EPT_VIOLATION:
	case SVM_VMEXIT_NPF:
	case SVM_VMEXIT_MSR:
	case SVM_VMEXIT_CPUID:
		/*
		 * We may be exiting to vmd to handle a pending interrupt but
		 * at the same time the last exit type may have been one of
		 * these. In this case, there's nothing extra to be done
		 * here (and falling through to the default case below results
		 * in more vmd log spam).
		 */
		break;
	case VMX_EXIT_IO:
	case SVM_VMEXIT_IOIO:
		vcpu_exit_inout(vrp);
		break;
	case VMX_EXIT_HLT:
	case SVM_VMEXIT_HLT:
		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
		if (ret) {
			log_warnx("%s: can't lock vcpu mutex (%d)",
			    __func__, ret);
			return (ret);
		}
		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
		if (ret) {
			log_warnx("%s: can't unlock vcpu mutex (%d)",
			    __func__, ret);
			return (ret);
		}
		break;
	case VMX_EXIT_TRIPLE_FAULT:
	case SVM_VMEXIT_SHUTDOWN:
		/* reset VM */
		return (EAGAIN);
	default:
		log_debug("%s: unknown exit reason 0x%x",
		    __progname, vrp->vrp_exit_reason);
	}

	/* Process any pending traffic */
	vionet_process_rx(vrp->vrp_vm_id);

	vrp->vrp_continue = 1;

	return (0);
}

/*
 * find_gpa_range
 *
 * Search for a contiguous guest physical mem range.
 *
 * Parameters:
 *  vcp: VM create parameters that contain the memory map to search in
 *  gpa: the starting guest physical address
 *  len: the length of the memory range
 *
 * Return values:
 *  NULL: on failure if there is no memory range as described by the
 *      parameters
 *  Pointer to vm_mem_range that contains the start of the range otherwise.
 */
static struct vm_mem_range *
find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
{
	size_t i, n;
	struct vm_mem_range *vmr;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
			break;
	}

	/* No range found. */
	if (i == vcp->vcp_nmemranges)
		return (NULL);

	/*
	 * vmr may cover the range [gpa, gpa + len) only partly. Make
	 * sure that the following vm_mem_ranges are contiguous and
	 * cover the rest.
	 */
	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
	if (len < n)
		len = 0;
	else
		len -= n;
	gpa = vmr->vmr_gpa + vmr->vmr_size;
	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa != vmr->vmr_gpa)
			return (NULL);
		if (len <= vmr->vmr_size)
			len = 0;
		else
			len -= vmr->vmr_size;

		gpa = vmr->vmr_gpa + vmr->vmr_size;
	}

	if (len != 0)
		return (NULL);

	return (vmr);
}

void *
vaddr_mem(paddr_t gpa, size_t len)
{
	struct vm_create_params *vcp = &current_vm->vm_params.vmc_params;
	size_t i;
	struct vm_mem_range *vmr;
	paddr_t gpend = gpa + len;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa < vmr->vmr_gpa)
			continue;

		if (gpend >= vmr->vmr_gpa + vmr->vmr_size)
			continue;

		return ((char *)vmr->vmr_va + (gpa - vmr->vmr_gpa));
	}

	return (NULL);
}

/*
 * write_mem
 *
 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
 *
 * Parameters:
 *  dst: the destination paddr_t in the guest VM
 *  buf: data to copy
 *  len: number of bytes to copy
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [dst, dst + len) does not
 *      exist in the guest.
 */
int
write_mem(paddr_t dst, const void *buf, size_t len)
{
	const char *from = buf;
	char *to;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
		    "len = 0x%zx", __func__, dst, len);
		return (EINVAL);
	}

	off = dst - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		to = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		from += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * read_mem
 *
 * Reads memory at guest paddr 'src' into 'buf'.
 *
 * Parameters:
 *  src: the source paddr_t in the guest VM to read from.
 *  buf: destination (local) buffer
 *  len: number of bytes to read
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [src, src + len) does not
 *      exist in the guest.
 */
int
read_mem(paddr_t src, void *buf, size_t len)
{
	char *from, *to = buf;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range src = 0x%lx, "
		    "len = 0x%zx", __func__, src, len);
		return (EINVAL);
	}

	off = src - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		from = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		to += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

int
iovec_mem(paddr_t src, size_t len, struct iovec *iov, int iovcnt)
{
	size_t n, off;
	struct vm_mem_range *vmr;
	int niov = 0;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		return (-1);
	}

	off = src - vmr->vmr_gpa;
	while (len > 0) {
		if (niov == iovcnt) {
			errno = ENOMEM;
			return (-1);
		}

		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		iov[niov].iov_base = (char *)vmr->vmr_va + off;
		iov[niov].iov_len = n;

		niov++;

		len -= n;
		off = 0;
		vmr++;
	}

	return (niov);
}

/*
 * vcpu_assert_pic_irq
 *
 * Injects the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to inject to
 *  vcpu_id: VCPU ID to inject to
 *  irq: IRQ to inject
 */
void
vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	int ret;

	i8259_assert_irq(irq);

	if (i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
			fatalx("%s: can't assert INTR", __func__);

		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);

		vcpu_hlt[vcpu_id] = 0;
		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
		if (ret)
			fatalx("%s: can't signal (%d)", __func__, ret);
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
	}
}

/*
 * vcpu_deassert_pic_irq
 *
 * Clears the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to clear in
 *  vcpu_id: VCPU ID to clear in
 *  irq: IRQ to clear
 */
void
vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	i8259_deassert_irq(irq);

	if (!i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
			fatalx("%s: can't deassert INTR for vm_id %d, "
			    "vcpu_id %d", __func__, vm_id, vcpu_id);
	}
}

/*
 * fd_hasdata
 *
 * Determines if data can be read from a file descriptor.
 *
 * Parameters:
 *  fd: the fd to check
 *
 * Return values:
 *  1 if data can be read from an fd, or 0 otherwise.
 */
int
fd_hasdata(int fd)
{
	struct pollfd pfd[1];
	int nready, hasdata = 0;

	pfd[0].fd = fd;
	pfd[0].events = POLLIN;
	nready = poll(pfd, 1, 0);
	if (nready == -1)
		log_warn("checking file descriptor for data failed");
	else if (nready == 1 && pfd[0].revents & POLLIN)
		hasdata = 1;
	return (hasdata);
}

/*
 * mutex_lock
 *
 * Wrapper function for pthread_mutex_lock that does error checking and that
 * exits on failure
 */
void
mutex_lock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_lock(m);
	if (ret) {
		errno = ret;
		fatal("could not acquire mutex");
	}
}

/*
 * mutex_unlock
 *
 * Wrapper function for pthread_mutex_unlock that does error checking and
 * that exits on failure
 */
void
mutex_unlock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_unlock(m);
	if (ret) {
		errno = ret;
		fatal("could not release mutex");
	}
}

/*
 * set_return_data
 *
 * Utility function for manipulating register data in vm exit info structs.
 * This function ensures that the data is copied to the vei->vei.vei_data
 * field with the proper size for the operation being performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: return data
 */
void
set_return_data(struct vm_exit *vei, uint32_t data)
{
	switch (vei->vei.vei_size) {
	case 1:
		vei->vei.vei_data &= ~0xFF;
		vei->vei.vei_data |= (uint8_t)data;
		break;
	case 2:
		vei->vei.vei_data &= ~0xFFFF;
		vei->vei.vei_data |= (uint16_t)data;
		break;
	case 4:
		vei->vei.vei_data = data;
		break;
	}
}

/*
 * get_input_data
 *
 * Utility function for manipulating register data in vm exit info
 * structs. This function ensures that the data is copied from the
 * vei->vei.vei_data field with the proper size for the operation being
 * performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: location to store the result
 */
void
get_input_data(struct vm_exit *vei, uint32_t *data)
{
	switch (vei->vei.vei_size) {
	case 1:
		*data &= 0xFFFFFF00;
		*data |= (uint8_t)vei->vei.vei_data;
		break;
	case 2:
		*data &= 0xFFFF0000;
		*data |= (uint16_t)vei->vei.vei_data;
		break;
	case 4:
		*data = vei->vei.vei_data;
		break;
	default:
		log_warnx("%s: invalid i/o size %d", __func__,
		    vei->vei.vei_size);
	}
}