xref: /openbsd-src/usr.sbin/vmd/vmm.c (revision fc405d53b73a2d73393cb97f684863d17b583e38)
1 /*	$OpenBSD: vmm.c,v 1.112 2023/05/13 23:15:28 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/ioctl.h>
21 #include <sys/queue.h>
22 #include <sys/wait.h>
23 #include <sys/uio.h>
24 #include <sys/socket.h>
25 #include <sys/time.h>
26 #include <sys/mman.h>
27 
28 #include <dev/ic/i8253reg.h>
29 #include <dev/isa/isareg.h>
30 #include <dev/pci/pcireg.h>
31 
32 #include <machine/psl.h>
33 #include <machine/specialreg.h>
34 #include <machine/vmmvar.h>
35 
36 #include <net/if.h>
37 
38 #include <errno.h>
39 #include <event.h>
40 #include <fcntl.h>
41 #include <imsg.h>
42 #include <limits.h>
43 #include <poll.h>
44 #include <pthread.h>
45 #include <stddef.h>
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50 #include <util.h>
51 
52 #include "vmd.h"
53 #include "vmm.h"
54 #include "atomicio.h"
55 
56 void	vmm_sighdlr(int, short, void *);
57 int	vmm_start_vm(struct imsg *, uint32_t *, pid_t *);
58 int	vmm_dispatch_parent(int, struct privsep_proc *, struct imsg *);
59 void	vmm_run(struct privsep *, struct privsep_proc *, void *);
60 void	vmm_dispatch_vm(int, short, void *);
61 int	terminate_vm(struct vm_terminate_params *);
62 int	get_info_vm(struct privsep *, struct imsg *, int);
63 int	opentap(char *);
64 
65 extern struct vmd *env;
66 
/* imsg handlers for the privsep channels this process holds. */
static struct privsep_proc procs[] = {
	{ "parent",	PROC_PARENT,	vmm_dispatch_parent  },
};
70 
/*
 * vmm
 *
 * Entry point for the vmm privsep process: hand control to the privsep
 * run loop with our parent-channel handlers and vmm_run as the
 * post-fork initializer.
 */
void
vmm(struct privsep *ps, struct privsep_proc *p)
{
	proc_run(ps, p, procs, nitems(procs), vmm_run, NULL);
}
76 
/*
 * vmm_run
 *
 * Run-time initialization for the vmm process: initialize configuration
 * state, restrict filesystem visibility and the syscall surface, then
 * take over SIGCHLD handling so we can reap VM child processes.
 * Ordering matters: unveil(2) must precede the final pledge(2).
 */
void
vmm_run(struct privsep *ps, struct privsep_proc *p, void *arg)
{
	if (config_init(ps->ps_env) == -1)
		fatal("failed to initialize configuration");

	/*
	 * We aren't root, so we can't chroot(2). Use unveil(2) instead.
	 */
	if (unveil(env->argv0, "x") == -1)
		fatal("unveil %s", env->argv0);
	if (unveil(NULL, NULL) == -1)
		fatal("unveil lock");

	/*
	 * pledge in the vmm process:
	 * stdio - for malloc and basic I/O including events.
	 * vmm - for the vmm ioctls and operations.
	 * proc, exec - for forking and execing new vm's.
	 * sendfd - for sending send/recv fds to vm proc.
	 * recvfd - for disks, interfaces and other fds.
	 */
	if (pledge("stdio vmm sendfd recvfd proc exec", NULL) == -1)
		fatal("pledge");

	/* Replace the generic privsep SIGCHLD handler with our VM reaper. */
	signal_del(&ps->ps_evsigchld);
	signal_set(&ps->ps_evsigchld, SIGCHLD, vmm_sighdlr, ps);
	signal_add(&ps->ps_evsigchld, NULL);
}
106 
/*
 * vmm_dispatch_parent
 *
 * imsg callback for messages received from the parent privsep process.
 * Handles VM lifecycle operations (start, terminate, pause/unpause,
 * send/receive for migration) and control messages (config, reset,
 * verbosity).  The first switch performs the operation and records an
 * errno-style result in 'res' plus an optional reply type in 'cmd';
 * the second switch sends the reply back to the parent when cmd != 0.
 *
 * Parameters:
 *  fd: unused, part of the privsep dispatch callback signature.
 *  p: the privsep process descriptor for the parent channel.
 *  imsg: the received message.
 *
 * Return values:
 *  0: message handled.
 *  -1: unknown message type, or composing the reply failed.
 */
int
vmm_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct privsep		*ps = p->p_ps;
	int			 res = 0, cmd = 0, verbose;
	struct vmd_vm		*vm = NULL;
	struct vm_terminate_params vtp;
	struct vmop_id		 vid;
	struct vmop_result	 vmr;
	struct vmop_create_params vmc;
	struct vmop_addr_result  var;
	/* Save peerid up front: the reply must target the original sender. */
	uint32_t		 id = 0, peerid = imsg->hdr.peerid;
	pid_t			 pid = 0;
	unsigned int		 mode, flags;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_START_VM_REQUEST:
		res = config_getvm(ps, imsg);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_CDROM:
		res = config_getcdrom(ps, imsg);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_DISK:
		res = config_getdisk(ps, imsg);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_IF:
		res = config_getif(ps, imsg);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_END:
		/* All start parameters received; fork+exec the VM process. */
		res = vmm_start_vm(imsg, &id, &pid);
		/* Check if the ID can be mapped correctly */
		if (res == 0 && (id = vm_id2vmid(id, NULL)) == 0)
			res = ENOENT;
		cmd = IMSG_VMDOP_START_VM_RESPONSE;
		break;
	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		flags = vid.vid_flags;

		DPRINTF("%s: recv'ed TERMINATE_VM for %d", __func__, id);

		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;

		if (id == 0) {
			res = ENOENT;
		} else if ((vm = vm_getbyvmid(id)) != NULL) {
			if (flags & VMOP_FORCE) {
				/* Forced stop: terminate via vmm(4) now. */
				vtp.vtp_vm_id = vm_vmid2id(vm->vm_vmid, vm);
				vm->vm_state |= VM_STATE_SHUTDOWN;
				(void)terminate_vm(&vtp);
				res = 0;
			} else if (!(vm->vm_state & VM_STATE_SHUTDOWN)) {
				log_debug("%s: sending shutdown request"
				    " to vm %d", __func__, id);

				/*
				 * Request reboot but mark the VM as shutting
				 * down. This way we can terminate the VM after
				 * the triple fault instead of reboot and
				 * avoid being stuck in the ACPI-less powerdown
				 * ("press any key to reboot") of the VM.
				 */
				vm->vm_state |= VM_STATE_SHUTDOWN;
				if (imsg_compose_event(&vm->vm_iev,
				    IMSG_VMDOP_VM_REBOOT,
				    0, 0, -1, NULL, 0) == -1)
					res = errno;
				else
					res = 0;
			} else {
				/*
				 * VM is currently being shutdown.
				 * Check to see if the VM process is still
				 * active.  If not, return VMD_VM_STOP_INVALID.
				 */
				if (vm_vmid2id(vm->vm_vmid, vm) == 0) {
					log_debug("%s: no vm running anymore",
					    __func__);
					res = VMD_VM_STOP_INVALID;
				}
			}
		} else {
			/* VM doesn't exist, cannot stop vm */
			log_debug("%s: cannot stop vm that is not running",
			    __func__);
			res = VMD_VM_STOP_INVALID;
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
		res = get_info_vm(ps, imsg, 0);
		cmd = IMSG_VMDOP_GET_INFO_VM_END_DATA;
		break;
	case IMSG_VMDOP_CONFIG:
		config_getconfig(env, imsg);
		break;
	case IMSG_CTL_RESET:
		IMSG_SIZE_CHECK(imsg, &mode);
		memcpy(&mode, imsg->data, sizeof(mode));

		if (mode & CONFIG_VMS) {
			/* Terminate and remove all VMs */
			vmm_shutdown();
			mode &= ~CONFIG_VMS;
		}

		config_getreset(env, imsg);
		break;
	case IMSG_CTL_VERBOSE:
		IMSG_SIZE_CHECK(imsg, &verbose);
		memcpy(&verbose, imsg->data, sizeof(verbose));
		log_setverbose(verbose);

		/* Forward message to each VM process */
		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
			imsg_compose_event(&vm->vm_iev,
			    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
			    -1, &verbose, sizeof(verbose));
		}
		break;
	case IMSG_VMDOP_PAUSE_VM:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		if ((vm = vm_getbyvmid(id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
			break;
		}
		/* Relay to the VM process; it replies via vmm_dispatch_vm. */
		imsg_compose_event(&vm->vm_iev,
		    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
		    imsg->fd, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_UNPAUSE_VM:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		if ((vm = vm_getbyvmid(id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
			break;
		}
		imsg_compose_event(&vm->vm_iev,
		    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
		    imsg->fd, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_SEND_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		if ((vm = vm_getbyvmid(id)) == NULL) {
			res = ENOENT;
			/* Nobody will consume the passed fd; close it. */
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		imsg_compose_event(&vm->vm_iev,
		    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
		    imsg->fd, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vmc);
		memcpy(&vmc, imsg->data, sizeof(vmc));
		if (vm_register(ps, &vmc, &vm,
		    imsg->hdr.peerid, vmc.vmc_owner.uid) != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		/* Migrated-in VMs start paused until RECEIVE_VM_END. */
		vm->vm_tty = imsg->fd;
		vm->vm_state |= VM_STATE_RECEIVED;
		vm->vm_state |= VM_STATE_PAUSED;
		break;
	case IMSG_VMDOP_RECEIVE_VM_END:
		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
			res = ENOENT;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		vm->vm_receive_fd = imsg->fd;
		res = vmm_start_vm(imsg, &id, &pid);
		/* Check if the ID can be mapped correctly */
		if ((id = vm_id2vmid(id, NULL)) == 0)
			res = ENOENT;
		cmd = IMSG_VMDOP_START_VM_RESPONSE;
		break;
	case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &var);
		memcpy(&var, imsg->data, sizeof(var));
		if ((vm = vm_getbyvmid(var.var_vmid)) == NULL) {
			res = ENOENT;
			break;
		}
		/* Forward hardware address details to the guest vm */
		imsg_compose_event(&vm->vm_iev,
		    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
		    imsg->fd, &var, sizeof(var));
		break;
	case IMSG_VMDOP_RECEIVE_VMM_FD:
		/* The parent passes us the vmm(4) device fd exactly once. */
		if (env->vmd_fd > -1)
			fatalx("already received vmm fd");
		env->vmd_fd = imsg->fd;

		/* Get and terminate all running VMs */
		get_info_vm(ps, NULL, 1);
		break;
	default:
		return (-1);
	}

	/* Reply to the parent if the handler above selected a response. */
	switch (cmd) {
	case 0:
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
		if (res != 0) {
			/* Remove local reference if it exists */
			if ((vm = vm_getbyvmid(imsg->hdr.peerid)) != NULL) {
				log_debug("%s: removing vm, START_VM_RESPONSE",
				    __func__);
				vm_remove(vm, __func__);
			}
		}
		if (id == 0)
			id = imsg->hdr.peerid;
		/* FALLTHROUGH */
	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		memset(&vmr, 0, sizeof(vmr));
		vmr.vmr_result = res;
		vmr.vmr_id = id;
		vmr.vmr_pid = pid;
		if (proc_compose_imsg(ps, PROC_PARENT, -1, cmd,
		    peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	default:
		if (proc_compose_imsg(ps, PROC_PARENT, -1, cmd,
		    peerid, -1, &res, sizeof(res)) == -1)
			return (-1);
		break;
	}

	return (0);
}
370 
/*
 * vmm_sighdlr
 *
 * SIGCHLD handler: reap exited VM child processes, make sure the
 * corresponding VM is terminated in vmm(4), notify the parent process
 * of the termination, and drop our local VM state.
 *
 * Parameters:
 *  sig: the delivered signal (only SIGCHLD is expected).
 *  event: unused libevent event flags.
 *  arg: the privsep context (struct privsep *).
 */
void
vmm_sighdlr(int sig, short event, void *arg)
{
	struct privsep *ps = arg;
	int status, ret = 0;
	uint32_t vmid;
	pid_t pid;
	struct vmop_result vmr;
	struct vmd_vm *vm;
	struct vm_terminate_params vtp;

	log_debug("%s: handling signal %d", __func__, sig);
	switch (sig) {
	case SIGCHLD:
		/* Reap every child that changed state; WNOHANG avoids blocking. */
		do {
			pid = waitpid(-1, &status, WNOHANG);
			if (pid <= 0)
				continue;

			if (WIFEXITED(status) || WIFSIGNALED(status)) {
				vm = vm_getbypid(pid);
				if (vm == NULL) {
					/*
					 * If the VM is gone already, it
					 * got terminated via a
					 * IMSG_VMDOP_TERMINATE_VM_REQUEST.
					 */
					continue;
				}

				/*
				 * NOTE(review): ret is only assigned for
				 * WIFEXITED children, so on a signaled exit
				 * it keeps its value from a previous loop
				 * iteration (initially 0) — confirm intended.
				 */
				if (WIFEXITED(status))
					ret = WEXITSTATUS(status);

				/* Don't reboot on pending shutdown */
				if (ret == EAGAIN &&
				    (vm->vm_state & VM_STATE_SHUTDOWN))
					ret = 0;

				vmid = vm->vm_params.vmc_params.vcp_id;
				vtp.vtp_vm_id = vmid;

				if (terminate_vm(&vtp) == 0)
					log_debug("%s: terminated vm %s"
					    " (id %d)", __func__,
					    vm->vm_params.vmc_params.vcp_name,
					    vm->vm_vmid);

				/* Tell the parent the VM is gone. */
				memset(&vmr, 0, sizeof(vmr));
				vmr.vmr_result = ret;
				vmr.vmr_id = vm_id2vmid(vmid, vm);
				if (proc_compose_imsg(ps, PROC_PARENT,
				    -1, IMSG_VMDOP_TERMINATE_VM_EVENT,
				    vm->vm_peerid, -1,
				    &vmr, sizeof(vmr)) == -1)
					log_warnx("could not signal "
					    "termination of VM %u to "
					    "parent", vm->vm_vmid);

				vm_remove(vm, __func__);
			} else
				fatalx("unexpected cause of SIGCHLD");
		} while (pid > 0 || (pid == -1 && errno == EINTR));
		break;
	default:
		fatalx("unexpected signal");
	}
}
438 
439 /*
440  * vmm_shutdown
441  *
442  * Terminate VMs on shutdown to avoid "zombie VM" processes.
443  */
444 void
445 vmm_shutdown(void)
446 {
447 	struct vm_terminate_params vtp;
448 	struct vmd_vm *vm, *vm_next;
449 
450 	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
451 		vtp.vtp_vm_id = vm_vmid2id(vm->vm_vmid, vm);
452 
453 		/* XXX suspend or request graceful shutdown */
454 		(void)terminate_vm(&vtp);
455 		vm_remove(vm, __func__);
456 	}
457 }
458 
459 /*
460  * vmm_pipe
461  *
462  * Create a new imsg control channel between vmm parent and a VM
463  * (can be called on both sides).
464  */
465 int
466 vmm_pipe(struct vmd_vm *vm, int fd, void (*cb)(int, short, void *))
467 {
468 	struct imsgev	*iev = &vm->vm_iev;
469 
470 	if (fcntl(fd, F_SETFL, O_NONBLOCK) == -1) {
471 		log_warn("failed to set nonblocking mode on vm pipe");
472 		return (-1);
473 	}
474 
475 	imsg_init(&iev->ibuf, fd);
476 	iev->handler = cb;
477 	iev->data = vm;
478 	imsg_event_add(iev);
479 
480 	return (0);
481 }
482 
/*
 * vmm_dispatch_vm
 *
 * imsg callback for messages that are received from a VM child process.
 * Tracks VM shutdown/reboot state changes locally and forwards
 * pause/unpause/send responses up to the parent process.
 */
void
vmm_dispatch_vm(int fd, short event, void *arg)
{
	struct vmd_vm		*vm = arg;
	struct vmop_result	 vmr;
	struct imsgev		*iev = &vm->vm_iev;
	struct imsgbuf		*ibuf = &iev->ibuf;
	struct imsg		 imsg;
	ssize_t			 n;
	unsigned int		 i;

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0) {
			/* This pipe is dead, so remove the event handler */
			event_del(&iev->ev);
			return;
		}
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
		if (n == 0) {
			/* This pipe is dead, so remove the event handler */
			event_del(&iev->ev);
			return;
		}
	}

	/* Drain every complete message currently buffered. */
	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

		DPRINTF("%s: got imsg %d from %s",
		    __func__, imsg.hdr.type,
		    vm->vm_params.vmc_params.vcp_name);

		switch (imsg.hdr.type) {
		case IMSG_VMDOP_VM_SHUTDOWN:
			vm->vm_state |= VM_STATE_SHUTDOWN;
			break;
		case IMSG_VMDOP_VM_REBOOT:
			vm->vm_state &= ~VM_STATE_SHUTDOWN;
			break;
		case IMSG_VMDOP_SEND_VM_RESPONSE:
			IMSG_SIZE_CHECK(&imsg, &vmr);
			/* FALLTHROUGH: forwarded to the parent like the rest */
		case IMSG_VMDOP_PAUSE_VM_RESPONSE:
		case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
			for (i = 0; i < nitems(procs); i++) {
				if (procs[i].p_id == PROC_PARENT) {
					proc_forward_imsg(procs[i].p_ps,
					    &imsg, PROC_PARENT, -1);
					break;
				}
			}
			break;

		default:
			fatalx("%s: got invalid imsg %d from %s",
			    __func__, imsg.hdr.type,
			    vm->vm_params.vmc_params.vcp_name);
		}
		imsg_free(&imsg);
	}
	/* Re-arm the event with EV_WRITE if output is still pending. */
	imsg_event_add(iev);
}
558 
559 /*
560  * terminate_vm
561  *
562  * Requests vmm(4) to terminate the VM whose ID is provided in the
563  * supplied vm_terminate_params structure (vtp->vtp_vm_id)
564  *
565  * Parameters
566  *  vtp: vm_terminate_params struct containing the ID of the VM to terminate
567  *
568  * Return values:
569  *  0: success
570  *  !0: ioctl to vmm(4) failed (eg, ENOENT if the supplied VM is not valid)
571  */
572 int
573 terminate_vm(struct vm_terminate_params *vtp)
574 {
575 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, vtp) == -1)
576 		return (errno);
577 
578 	return (0);
579 }
580 
581 /*
582  * opentap
583  *
584  * Opens the next available tap device, up to MAX_TAP.
585  *
586  * Parameters
587  *  ifname: a buffer of at least IF_NAMESIZE bytes.
588  *
589  * Returns a file descriptor to the tap node opened, or -1 if no tap
590  * devices were available.
591  */
592 int
593 opentap(char *ifname)
594 {
595 	int i, fd;
596 	char path[PATH_MAX];
597 
598 	for (i = 0; i < MAX_TAP; i++) {
599 		snprintf(path, PATH_MAX, "/dev/tap%d", i);
600 		fd = open(path, O_RDWR | O_NONBLOCK);
601 		if (fd != -1) {
602 			snprintf(ifname, IF_NAMESIZE, "tap%d", i);
603 			return (fd);
604 		}
605 	}
606 	strlcpy(ifname, "tap", IF_NAMESIZE);
607 
608 	return (-1);
609 }
610 
/*
 * vmm_start_vm
 *
 * Prepares and fork+execs a new VM process.  The parent half writes the
 * vm state over a socketpair to the child and reads back the kernel VM
 * id; the child half detaches, re-enables the inherited fds across
 * exec, and re-execs vmd with the pipe and vmm device fds as arguments.
 *
 * Parameters:
 *  imsg: The VM data structure that is including the VM create parameters.
 *  id: Returns the VM id as reported by the kernel and obtained from the VM.
 *  pid: Returns the VM pid to the parent.
 *
 * Return values:
 *  0: success
 *  !0: failure - typically an errno indicating the source of the failure
 */
int
vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
{
	struct vm_create_params	*vcp;
	struct vmd_vm		*vm;
	char			*nargv[8], num[32], vmm_fd[32];
	int			 fd, ret = EINVAL;
	int			 fds[2];
	pid_t			 vm_pid;
	size_t			 i, j, sz;

	if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
		log_warnx("%s: can't find vm", __func__);
		ret = ENOENT;
		goto err;
	}
	vcp = &vm->vm_params.vmc_params;

	/* A received (migrated-in) VM already has its tty set up. */
	if (!(vm->vm_state & VM_STATE_RECEIVED)) {
		if ((vm->vm_tty = imsg->fd) == -1) {
			log_warnx("%s: can't get tty", __func__);
			goto err;
		}
	}

	/* fds[0] stays with the parent, fds[1] goes to the child. */
	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, fds) == -1)
		fatal("socketpair");

	/* Keep our channel open after exec. */
	if (fcntl(fds[1], F_SETFD, 0)) {
		/*
		 * NOTE(review): fds[0]/fds[1] are not closed on this error
		 * path (err only removes the vm) — possible fd leak; confirm.
		 */
		ret = errno;
		log_warn("%s: fcntl", __func__);
		goto err;
	}

	/* Start child vmd for this VM (fork, chroot, drop privs) */
	vm_pid = fork();
	if (vm_pid == -1) {
		log_warn("%s: start child failed", __func__);
		ret = EIO;
		goto err;
	}

	if (vm_pid > 0) {
		/* Parent */
		vm->vm_pid = vm_pid;
		close_fd(fds[1]);

		/* Send the details over the pipe to the child. */
		sz = atomicio(vwrite, fds[0], vm, sizeof(*vm));
		if (sz != sizeof(*vm)) {
			log_warnx("%s: failed to send config for vm '%s'",
			    __func__, vcp->vcp_name);
			ret = EIO;
			/* Defer error handling until after fd closing. */
		}

		/* As the parent/vmm process, we no longer need these fds. */
		for (i = 0 ; i < vm->vm_params.vmc_ndisks; i++) {
			for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
				if (close_fd(vm->vm_disks[i][j]) == 0)
				    vm->vm_disks[i][j] = -1;
			}
		}
		for (i = 0 ; i < vm->vm_params.vmc_nnics; i++) {
			if (close_fd(vm->vm_ifs[i].vif_fd) == 0)
			    vm->vm_ifs[i].vif_fd = -1;
		}
		if (close_fd(vm->vm_kernel) == 0)
			vm->vm_kernel = -1;
		if (close_fd(vm->vm_cdrom) == 0)
			vm->vm_cdrom = -1;
		if (close_fd(vm->vm_tty) == 0)
			vm->vm_tty = -1;

		/* Deferred error handling from sending the vm struct. */
		if (ret == EIO)
			goto err;

		/* Read back the kernel-generated vm id from the child */
		sz = atomicio(read, fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id));
		if (sz != sizeof(vcp->vcp_id)) {
			log_debug("%s: failed to receive vm id from vm %s",
			    __func__, vcp->vcp_name);
			/* vmd could not allocate memory for the vm. */
			ret = ENOMEM;
			goto err;
		}

		/* Check for an invalid id. This indicates child failure. */
		if (vcp->vcp_id == 0)
			goto err;

		*id = vcp->vcp_id;
		*pid = vm->vm_pid;

		/* Wire up our pipe into the event handling. */
		if (vmm_pipe(vm, fds[0], vmm_dispatch_vm) == -1)
			fatal("setup vm pipe");

		return (0);
	} else {
		/* Child. Create a new session. */
		if (setsid() == -1)
			fatal("setsid");

		close_fd(fds[0]);
		close_fd(PROC_PARENT_SOCK_FILENO);

		/* Detach from terminal. */
		if (!env->vmd_debug && (fd =
			open("/dev/null", O_RDWR, 0)) != -1) {
			dup2(fd, STDIN_FILENO);
			dup2(fd, STDOUT_FILENO);
			dup2(fd, STDERR_FILENO);
			if (fd > 2)
				close(fd);
		}

		/* Toggle all fds to not close on exec. */
		for (i = 0 ; i < vm->vm_params.vmc_ndisks; i++)
			for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
				if (vm->vm_disks[i][j] != -1)
					fcntl(vm->vm_disks[i][j], F_SETFD, 0);
		for (i = 0 ; i < vm->vm_params.vmc_nnics; i++)
			fcntl(vm->vm_ifs[i].vif_fd, F_SETFD, 0);
		if (vm->vm_kernel != -1)
			fcntl(vm->vm_kernel, F_SETFD, 0);
		if (vm->vm_cdrom != -1)
			fcntl(vm->vm_cdrom, F_SETFD, 0);
		if (vm->vm_tty != -1)
			fcntl(vm->vm_tty, F_SETFD, 0);
		fcntl(env->vmd_fd, F_SETFD, 0);	/* vmm device fd */

		/*
		 * Prepare our new argv for execvp(2) with the fd of our open
		 * pipe to the parent/vmm process as an argument.
		 */
		memset(&nargv, 0, sizeof(nargv));
		memset(num, 0, sizeof(num));
		snprintf(num, sizeof(num), "%d", fds[1]);
		memset(vmm_fd, 0, sizeof(vmm_fd));
		snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);

		nargv[0] = env->argv0;
		nargv[1] = "-V";
		nargv[2] = num;
		nargv[3] = "-n";
		nargv[4] = "-i";
		nargv[5] = vmm_fd;

		if (env->vmd_verbose) {
			nargv[6] = "-v";
			nargv[7] = NULL;
		} else
			nargv[6] = NULL;

		/* Control resumes in vmd main(). */
		execvp(nargv[0], nargv);

		ret = errno;
		log_warn("execvp %s", nargv[0]);
		_exit(ret);
		/* NOTREACHED */
	}

	/* Not reached: the parent returns above, the child execs or exits. */
	return (0);

 err:
	if (!vm->vm_from_config)
		vm_remove(vm, __func__);

	return (ret);
}
799 
800 /*
801  * get_info_vm
802  *
803  * Returns a list of VMs known to vmm(4).
804  *
805  * Parameters:
806  *  ps: the privsep context.
807  *  imsg: the received imsg including the peer id.
808  *  terminate: terminate the listed vm.
809  *
810  * Return values:
811  *  0: success
812  *  !0: failure (eg, ENOMEM, EIO or another error code from vmm(4) ioctl)
813  */
814 int
815 get_info_vm(struct privsep *ps, struct imsg *imsg, int terminate)
816 {
817 	int ret;
818 	size_t ct, i;
819 	struct vm_info_params vip;
820 	struct vm_info_result *info;
821 	struct vm_terminate_params vtp;
822 	struct vmop_info_result vir;
823 
824 	/*
825 	 * We issue the VMM_IOC_INFO ioctl twice, once with an input
826 	 * buffer size of 0, which results in vmm(4) returning the
827 	 * number of bytes required back to us in vip.vip_size,
828 	 * and then we call it again after malloc'ing the required
829 	 * number of bytes.
830 	 *
831 	 * It is possible that we could fail a second time (e.g. if
832 	 * another VM was created in the instant between the two
833 	 * ioctls, but in that case the caller can just try again
834 	 * as vmm(4) will return a zero-sized list in that case.
835 	 */
836 	vip.vip_size = 0;
837 	info = NULL;
838 	ret = 0;
839 	memset(&vir, 0, sizeof(vir));
840 
841 	/* First ioctl to see how many bytes needed (vip.vip_size) */
842 	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) == -1)
843 		return (errno);
844 
845 	if (vip.vip_info_ct != 0)
846 		return (EIO);
847 
848 	info = malloc(vip.vip_size);
849 	if (info == NULL)
850 		return (ENOMEM);
851 
852 	/* Second ioctl to get the actual list */
853 	vip.vip_info = info;
854 	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) == -1) {
855 		ret = errno;
856 		free(info);
857 		return (ret);
858 	}
859 
860 	/* Return info */
861 	ct = vip.vip_size / sizeof(struct vm_info_result);
862 	for (i = 0; i < ct; i++) {
863 		if (terminate) {
864 			vtp.vtp_vm_id = info[i].vir_id;
865 			if ((ret = terminate_vm(&vtp)) != 0)
866 				break;
867 			log_debug("%s: terminated vm %s (id %d)", __func__,
868 			    info[i].vir_name, info[i].vir_id);
869 			continue;
870 		}
871 		memcpy(&vir.vir_info, &info[i], sizeof(vir.vir_info));
872 		vir.vir_info.vir_id = vm_id2vmid(info[i].vir_id, NULL);
873 		if (proc_compose_imsg(ps, PROC_PARENT, -1,
874 		    IMSG_VMDOP_GET_INFO_VM_DATA, imsg->hdr.peerid, -1,
875 		    &vir, sizeof(vir)) == -1) {
876 			ret = EIO;
877 			break;
878 		}
879 	}
880 	free(info);
881 
882 	return (ret);
883 }
884