1 /*	$OpenBSD: vmd.c,v 1.149 2023/05/13 23:15:28 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/queue.h>
21 #include <sys/wait.h>
22 #include <sys/stat.h>
23 #include <sys/sysctl.h>
24 #include <sys/tty.h>
25 #include <sys/ttycom.h>
26 #include <sys/ioctl.h>
27 
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <termios.h>
32 #include <errno.h>
33 #include <event.h>
34 #include <fcntl.h>
35 #include <pwd.h>
36 #include <signal.h>
37 #include <syslog.h>
38 #include <unistd.h>
39 #include <util.h>
40 #include <ctype.h>
41 #include <grp.h>
42 
43 #include <machine/specialreg.h>
44 #include <machine/vmmvar.h>
45 
46 #include "proc.h"
47 #include "atomicio.h"
48 #include "vmd.h"
49 
50 __dead void usage(void);
51 
52 int	 main(int, char **);
53 int	 vmd_configure(void);
54 void	 vmd_sighdlr(int sig, short event, void *arg);
55 void	 vmd_shutdown(void);
56 int	 vmd_control_run(void);
57 int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
58 int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
59 int	 vmd_dispatch_agentx(int, struct privsep_proc *, struct imsg *);
60 int	 vmd_dispatch_priv(int, struct privsep_proc *, struct imsg *);
61 int	 vmd_check_vmh(struct vm_dump_header *);
62 
63 int	 vm_instance(struct privsep *, struct vmd_vm **,
64 	    struct vmop_create_params *, uid_t);
65 int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
66 int	 vm_claimid(const char *, int, uint32_t *);
67 void	 start_vm_batch(int, short, void*);
68 
69 static inline void vm_terminate(struct vmd_vm *, const char *);
70 
71 struct vmd	*env;
72 
73 static struct privsep_proc procs[] = {
74 	/* Keep "priv" on top as procs[0] */
75 	{ "priv",	PROC_PRIV,	vmd_dispatch_priv, priv },
76 	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
77 	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm,
78 	  vmm_shutdown, "/" },
79 	{ "agentx", 	PROC_AGENTX,	vmd_dispatch_agentx, vm_agentx,
80 	  vm_agentx_shutdown, "/" }
81 };
82 
83 enum privsep_procid privsep_process;
84 
85 struct event staggered_start_timer;
86 
87 /* For the privileged process */
88 static struct privsep_proc *proc_priv = &procs[0];
89 static struct passwd proc_privpw;
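/* An all-zero MAC, used to detect interfaces without a configured lladdr. */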
90 static const uint8_t zero_mac[ETHER_ADDR_LEN];
91 
92 const char		 default_conffile[] = VMD_CONF;
93 const char		*conffile = default_conffile;
94 
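/*
 * Handle imsgs sent by the control process on behalf of vmctl(8)
 * clients and compose the matching responses.  Returns 0 on success
 * and -1 if the imsg could not be handled or forwarded.
 */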
95 int
96 vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
97 {
98 	struct privsep			*ps = p->p_ps;
99 	int				 res = 0, ret = 0, cmd = 0, verbose;
100 	unsigned int			 v = 0, flags;
101 	struct vmop_create_params	 vmc;
102 	struct vmop_id			 vid;
103 	struct vmop_result		 vmr;
104 	struct vm_dump_header		 vmh;
105 	struct vmd_vm			*vm = NULL;
106 	char				*str = NULL;
107 	uint32_t			 id = 0;
108 	struct control_sock		*rcs;
109 
110 	switch (imsg->hdr.type) {
111 	case IMSG_VMDOP_START_VM_REQUEST:
112 		IMSG_SIZE_CHECK(imsg, &vmc);
113 		memcpy(&vmc, imsg->data, sizeof(vmc));
114 		vmc.vmc_kernel = imsg->fd;
115 
116 		/* Try registering our VM in our list of known VMs. */
117 		if (vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid)) {
118 			res = errno;
119 
120 			/* Did we have a failure during lookup of a parent? */
121 			if (vm == NULL) {
122 				cmd = IMSG_VMDOP_START_VM_RESPONSE;
123 				break;
124 			}
125 
126 			/* Does the VM already exist? */
127 			if (res == EALREADY) {
128 				/* Is it already running? */
129 				if (vm->vm_state & VM_STATE_RUNNING) {
130 					cmd = IMSG_VMDOP_START_VM_RESPONSE;
131 					break;
132 				}
133 
134 				/* If not running, are our flags ok? */
135 				if (vmc.vmc_flags &&
136 				    vmc.vmc_flags != VMOP_CREATE_KERNEL) {
137 					cmd = IMSG_VMDOP_START_VM_RESPONSE;
138 					break;
139 				}
140 			}
141 			res = 0;
142 		}
143 
144 		/* Try to start the launch of the VM. */
145 		res = config_setvm(ps, vm, imsg->hdr.peerid,
146 		    vm->vm_params.vmc_owner.uid);
147 		if (res)
148 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
149 		break;
150 	case IMSG_VMDOP_WAIT_VM_REQUEST:
151 	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
152 		IMSG_SIZE_CHECK(imsg, &vid);
153 		memcpy(&vid, imsg->data, sizeof(vid));
154 		flags = vid.vid_flags;
155 		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
156 
157 		if ((id = vid.vid_id) == 0) {
			/* Look up the vm id by name */
159 			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
160 				res = ENOENT;
161 				break;
162 			} else if ((vm->vm_state & VM_STATE_SHUTDOWN) &&
163 			    (flags & VMOP_FORCE) == 0) {
164 				res = EALREADY;
165 				break;
166 			} else if (!(vm->vm_state & VM_STATE_RUNNING)) {
167 				res = EINVAL;
168 				break;
169 			}
170 			id = vm->vm_vmid;
171 		} else if ((vm = vm_getbyvmid(id)) == NULL) {
172 			res = ENOENT;
173 			break;
174 		}
175 		if (vm_checkperm(vm, &vm->vm_params.vmc_owner, vid.vid_uid)) {
176 			res = EPERM;
177 			break;
178 		}
179 
180 		/* Only relay TERMINATION requests, not WAIT requests */
181 		if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_REQUEST) {
182 			memset(&vid, 0, sizeof(vid));
183 			vid.vid_id = id;
184 			vid.vid_flags = flags;
185 
186 			if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
187 				imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
188 				return (-1);
189 		}
190 		break;
191 	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
192 		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
193 		break;
194 	case IMSG_VMDOP_LOAD:
195 		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
196 		str = get_string((uint8_t *)imsg->data,
197 		    IMSG_DATA_SIZE(imsg));
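		/* FALLTHROUGH */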
198 	case IMSG_VMDOP_RELOAD:
199 		if (vmd_reload(0, str) == -1)
200 			cmd = IMSG_CTL_FAIL;
201 		else
202 			cmd = IMSG_CTL_OK;
203 		free(str);
204 		break;
205 	case IMSG_CTL_RESET:
206 		IMSG_SIZE_CHECK(imsg, &v);
207 		memcpy(&v, imsg->data, sizeof(v));
208 		if (vmd_reload(v, NULL) == -1)
209 			cmd = IMSG_CTL_FAIL;
210 		else
211 			cmd = IMSG_CTL_OK;
212 		break;
213 	case IMSG_CTL_VERBOSE:
214 		IMSG_SIZE_CHECK(imsg, &verbose);
215 		memcpy(&verbose, imsg->data, sizeof(verbose));
216 		log_setverbose(verbose);
217 
218 		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
219 		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
220 		cmd = IMSG_CTL_OK;
221 		break;
222 	case IMSG_VMDOP_PAUSE_VM:
223 	case IMSG_VMDOP_UNPAUSE_VM:
224 		IMSG_SIZE_CHECK(imsg, &vid);
225 		memcpy(&vid, imsg->data, sizeof(vid));
226 		if (vid.vid_id == 0) {
227 			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
228 				res = ENOENT;
229 				cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
230 				    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
231 				    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
232 				break;
233 			} else {
234 				vid.vid_id = vm->vm_vmid;
235 			}
236 		} else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
237 			res = ENOENT;
238 			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
239 			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
240 			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
241 			break;
242 		}
243 		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
244 		    vid.vid_uid) != 0) {
245 			res = EPERM;
246 			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
247 			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
248 			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
249 			break;
250 		}
251 		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
252 		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
253 		break;
254 	case IMSG_VMDOP_SEND_VM_REQUEST:
255 		IMSG_SIZE_CHECK(imsg, &vid);
256 		memcpy(&vid, imsg->data, sizeof(vid));
257 		id = vid.vid_id;
258 		if (vid.vid_id == 0) {
259 			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
260 				res = ENOENT;
261 				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
262 				close(imsg->fd);
263 				break;
264 			} else {
265 				vid.vid_id = vm->vm_vmid;
266 			}
267 		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
268 			res = ENOENT;
269 			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
270 			close(imsg->fd);
271 			break;
272 		}
273 		vmr.vmr_id = vid.vid_id;
274 		log_debug("%s: sending fd to vmm", __func__);
275 		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
276 		    imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid));
277 		break;
278 	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
279 		IMSG_SIZE_CHECK(imsg, &vid);
280 		memcpy(&vid, imsg->data, sizeof(vid));
281 		if (imsg->fd == -1) {
282 			log_warnx("%s: invalid fd", __func__);
283 			return (-1);
284 		}
285 		if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) !=
286 		    sizeof(vmh)) {
287 			log_warnx("%s: error reading vmh from received vm",
288 			    __func__);
289 			res = EIO;
290 			close(imsg->fd);
291 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
292 			break;
293 		}
294 
295 		if (vmd_check_vmh(&vmh)) {
296 			res = ENOENT;
297 			close(imsg->fd);
298 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
299 			break;
300 		}
301 		if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) !=
302 		    sizeof(vmc)) {
303 			log_warnx("%s: error reading vmc from received vm",
304 			    __func__);
305 			res = EIO;
306 			close(imsg->fd);
307 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
308 			break;
309 		}
310 		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
311 		    sizeof(vmc.vmc_params.vcp_name));
312 		vmc.vmc_params.vcp_id = 0;
313 
314 		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
315 		if (ret != 0) {
316 			res = errno;
317 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
318 			close(imsg->fd);
319 		} else {
320 			vm->vm_state |= VM_STATE_RECEIVED;
321 			config_setvm(ps, vm, imsg->hdr.peerid,
322 			    vmc.vmc_owner.uid);
323 			log_debug("%s: sending fd to vmm", __func__);
324 			proc_compose_imsg(ps, PROC_VMM, -1,
325 			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd,
326 			    NULL, 0);
327 		}
328 		break;
329 	case IMSG_VMDOP_DONE:
330 		control_reset(&ps->ps_csock);
331 		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
332 			control_reset(rcs);
333 		cmd = 0;
334 		break;
335 	default:
336 		return (-1);
337 	}
338 
339 	switch (cmd) {
340 	case 0:
341 		break;
342 	case IMSG_VMDOP_START_VM_RESPONSE:
343 	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
344 		memset(&vmr, 0, sizeof(vmr));
345 		vmr.vmr_result = res;
346 		vmr.vmr_id = id;
347 		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
348 		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
349 			return (-1);
350 		break;
351 	default:
352 		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
353 		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
354 			return (-1);
355 		break;
356 	}
357 
358 	return (0);
359 }
360 
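/*
 * Handle imsgs sent back by the vmm process: pause/unpause, start and
 * terminate responses, termination events and VM info replies that
 * are relayed on to the control or agentx process.
 */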
361 int
362 vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
363 {
364 	struct vmop_result	 vmr;
365 	struct privsep		*ps = p->p_ps;
366 	int			 res = 0;
367 	struct vmd_vm		*vm;
368 	struct vm_create_params	*vcp;
369 	struct vmop_info_result	 vir;
370 
371 	switch (imsg->hdr.type) {
372 	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
373 		IMSG_SIZE_CHECK(imsg, &vmr);
374 		memcpy(&vmr, imsg->data, sizeof(vmr));
375 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
376 			break;
		proc_compose_imsg(ps, PROC_CONTROL, -1,
		    imsg->hdr.type, imsg->hdr.peerid, -1,
		    &vmr, sizeof(vmr));
380 		log_info("%s: paused vm %d successfully",
381 		    vm->vm_params.vmc_params.vcp_name,
382 		    vm->vm_vmid);
383 		vm->vm_state |= VM_STATE_PAUSED;
384 		break;
385 	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
386 		IMSG_SIZE_CHECK(imsg, &vmr);
387 		memcpy(&vmr, imsg->data, sizeof(vmr));
388 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
389 			break;
		proc_compose_imsg(ps, PROC_CONTROL, -1,
		    imsg->hdr.type, imsg->hdr.peerid, -1,
		    &vmr, sizeof(vmr));
393 		log_info("%s: unpaused vm %d successfully.",
394 		    vm->vm_params.vmc_params.vcp_name,
395 		    vm->vm_vmid);
396 		vm->vm_state &= ~VM_STATE_PAUSED;
397 		break;
398 	case IMSG_VMDOP_START_VM_RESPONSE:
399 		IMSG_SIZE_CHECK(imsg, &vmr);
400 		memcpy(&vmr, imsg->data, sizeof(vmr));
401 		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
402 			break;
403 		vm->vm_pid = vmr.vmr_pid;
404 		vcp = &vm->vm_params.vmc_params;
405 		vcp->vcp_id = vmr.vmr_id;
406 
		/*
		 * If the peerid is not -1, forward the response back to the
		 * control socket.  If it is -1, the request originated
		 * from the parent, not the control socket.
		 */
412 		if (vm->vm_peerid != (uint32_t)-1) {
413 			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
414 			    sizeof(vmr.vmr_ttyname));
415 			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
416 			    imsg->hdr.type, vm->vm_peerid, -1,
417 			    &vmr, sizeof(vmr)) == -1) {
418 				errno = vmr.vmr_result;
419 				log_warn("%s: failed to forward vm result",
420 				    vcp->vcp_name);
421 				vm_terminate(vm, __func__);
422 				return (-1);
423 			}
424 		}
425 
426 		if (vmr.vmr_result) {
427 			log_warnx("%s: failed to start vm", vcp->vcp_name);
428 			vm_terminate(vm, __func__);
429 			errno = vmr.vmr_result;
430 			break;
431 		}
432 
433 		/* Now configure all the interfaces */
434 		if (vm_priv_ifconfig(ps, vm) == -1) {
435 			log_warn("%s: failed to configure vm", vcp->vcp_name);
436 			vm_terminate(vm, __func__);
437 			break;
438 		}
439 
440 		log_info("%s: started vm %d successfully, tty %s",
441 		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
442 		break;
443 	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
444 		IMSG_SIZE_CHECK(imsg, &vmr);
445 		memcpy(&vmr, imsg->data, sizeof(vmr));
446 
447 		if (vmr.vmr_result) {
448 			DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
449 			    __func__, vmr.vmr_id);
450 			proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
451 		} else {
452 			if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
453 				break;
454 			/* Mark VM as shutting down */
455 			vm->vm_state |= VM_STATE_SHUTDOWN;
456 		}
457 		break;
458 	case IMSG_VMDOP_SEND_VM_RESPONSE:
459 		IMSG_SIZE_CHECK(imsg, &vmr);
460 		memcpy(&vmr, imsg->data, sizeof(vmr));
461 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
462 			break;
463 		if (!vmr.vmr_result) {
464 			log_info("%s: sent vm %d successfully.",
465 			    vm->vm_params.vmc_params.vcp_name,
466 			    vm->vm_vmid);
467 			vm_terminate(vm, __func__);
468 		}
469 
470 		/* Send a response if a control client is waiting for it */
471 		if (imsg->hdr.peerid != (uint32_t)-1) {
472 			/* the error is meaningless for deferred responses */
473 			vmr.vmr_result = 0;
474 
475 			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
476 			    IMSG_VMDOP_SEND_VM_RESPONSE,
477 			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
478 				return (-1);
479 		}
480 		break;
481 	case IMSG_VMDOP_TERMINATE_VM_EVENT:
482 		IMSG_SIZE_CHECK(imsg, &vmr);
483 		memcpy(&vmr, imsg->data, sizeof(vmr));
484 		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
485 		    __func__, vmr.vmr_id, vmr.vmr_result);
486 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
487 			log_debug("%s: vm %d is no longer available",
488 			    __func__, vmr.vmr_id);
489 			break;
490 		}
491 		if (vmr.vmr_result != EAGAIN ||
492 		    vm->vm_params.vmc_bootdevice) {
493 			vm_terminate(vm, __func__);
494 		} else {
495 			/* Stop VM instance but keep the tty open */
496 			vm_stop(vm, 1, __func__);
497 			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
498 		}
499 
500 		/* The error is meaningless for deferred responses */
501 		vmr.vmr_result = 0;
502 
503 		if (proc_compose_imsg(ps, PROC_CONTROL, -1,
504 			IMSG_VMDOP_TERMINATE_VM_EVENT,
505 			imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
506 			return (-1);
507 		break;
508 	case IMSG_VMDOP_GET_INFO_VM_DATA:
509 		IMSG_SIZE_CHECK(imsg, &vir);
510 		memcpy(&vir, imsg->data, sizeof(vir));
511 		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
512 			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
513 			if (vm->vm_ttyname[0] != '\0')
514 				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
515 				    sizeof(vir.vir_ttyname));
516 			log_debug("%s: running vm: %d, vm_state: 0x%x",
517 			    __func__, vm->vm_vmid, vm->vm_state);
518 			vir.vir_state = vm->vm_state;
519 			/* get the user id who started the vm */
520 			vir.vir_uid = vm->vm_uid;
521 			vir.vir_gid = vm->vm_params.vmc_owner.gid;
522 		}
523 		if (proc_compose_imsg(ps,
524 		    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
525 		    PROC_AGENTX : PROC_CONTROL, -1, imsg->hdr.type,
526 		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
527 			log_debug("%s: GET_INFO_VM failed for vm %d, removing",
528 			    __func__, vm->vm_vmid);
529 			vm_terminate(vm, __func__);
530 			return (-1);
531 		}
532 		break;
533 	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
534 		/*
535 		 * PROC_VMM has responded with the *running* VMs, now we
536 		 * append the others. These use the special value 0 for their
537 		 * kernel id to indicate that they are not running.
538 		 */
539 		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
540 			if (!(vm->vm_state & VM_STATE_RUNNING)) {
541 				memset(&vir, 0, sizeof(vir));
542 				vir.vir_info.vir_id = vm->vm_vmid;
543 				strlcpy(vir.vir_info.vir_name,
544 				    vm->vm_params.vmc_params.vcp_name,
545 				    VMM_MAX_NAME_LEN);
546 				vir.vir_info.vir_memory_size =
547 				    vm->vm_params.vmc_params.
548 				    vcp_memranges[0].vmr_size;
549 				vir.vir_info.vir_ncpus =
550 				    vm->vm_params.vmc_params.vcp_ncpus;
551 				/* get the configured user id for this vm */
552 				vir.vir_uid = vm->vm_params.vmc_owner.uid;
553 				vir.vir_gid = vm->vm_params.vmc_owner.gid;
554 				log_debug("%s: vm: %d, vm_state: 0x%x",
555 				    __func__, vm->vm_vmid, vm->vm_state);
556 				vir.vir_state = vm->vm_state;
557 				if (proc_compose_imsg(ps,
558 				    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
559 				    PROC_AGENTX : PROC_CONTROL, -1,
560 				    IMSG_VMDOP_GET_INFO_VM_DATA,
561 				    imsg->hdr.peerid, -1, &vir,
562 				    sizeof(vir)) == -1) {
563 					log_debug("%s: GET_INFO_VM_END failed",
564 					    __func__);
565 					vm_terminate(vm, __func__);
566 					return (-1);
567 				}
568 			}
569 		}
570 		IMSG_SIZE_CHECK(imsg, &res);
571 		proc_forward_imsg(ps, imsg,
572 		    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
573 		    PROC_AGENTX : PROC_CONTROL, -1);
574 		break;
575 	default:
576 		return (-1);
577 	}
578 
579 	return (0);
580 }
581 
582 int
583 vmd_dispatch_agentx(int fd, struct privsep_proc *p, struct imsg *imsg)
584 {
585 	struct privsep			*ps = p->p_ps;
586 
587 	switch (imsg->hdr.type) {
588 	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
589 		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
590 		return (0);
591 	default:
592 		break;
593 	}
594 	return (-1);
595 }
596 
597 int
598 vmd_dispatch_priv(int fd, struct privsep_proc *p, struct imsg *imsg)
599 {
600 	struct vmop_addr_result	 var;
601 
602 	switch (imsg->hdr.type) {
603 	case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
604 		IMSG_SIZE_CHECK(imsg, &var);
605 		memcpy(&var, imsg->data, sizeof(var));
606 		proc_forward_imsg(p->p_ps, imsg, PROC_VMM, -1);
607 		break;
608 	default:
609 		return (-1);
610 	}
611 
612 	return (0);
613 }
614 
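/*
 * Check whether a received VM dump is compatible with this host by
 * comparing the dump signature, version and a set of saved CPUID
 * leaves against the values reported by the local CPU.  Returns 0 if
 * the dump can be resumed here, -1 otherwise.
 */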
615 int
616 vmd_check_vmh(struct vm_dump_header *vmh)
617 {
618 	int i;
619 	unsigned int code, leaf;
620 	unsigned int a, b, c, d;
621 
	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE,
	    strlen(VM_DUMP_SIGNATURE)) != 0) {
623 		log_warnx("%s: incompatible dump signature", __func__);
624 		return (-1);
625 	}
626 
627 	if (vmh->vmh_version != VM_DUMP_VERSION) {
628 		log_warnx("%s: incompatible dump version", __func__);
629 		return (-1);
630 	}
631 
632 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
633 		code = vmh->vmh_cpuids[i].code;
634 		leaf = vmh->vmh_cpuids[i].leaf;
635 		if (leaf != 0x00) {
636 			log_debug("%s: invalid leaf 0x%x for code 0x%x",
637 			    __func__, leaf, code);
638 			return (-1);
639 		}
640 
641 		switch (code) {
642 		case 0x00:
643 			CPUID_LEAF(code, leaf, a, b, c, d);
644 			if (vmh->vmh_cpuids[i].a > a) {
645 				log_debug("%s: incompatible cpuid level",
646 				    __func__);
647 				return (-1);
648 			}
649 			if (!(vmh->vmh_cpuids[i].b == b &&
650 			    vmh->vmh_cpuids[i].c == c &&
651 			    vmh->vmh_cpuids[i].d == d)) {
652 				log_debug("%s: incompatible cpu brand",
653 				    __func__);
654 				return (-1);
655 			}
656 			break;
657 
658 		case 0x01:
659 			CPUID_LEAF(code, leaf, a, b, c, d);
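			/*
			 * The dump is rejected if it recorded a maskable
			 * feature bit that this host's CPUID does not
			 * advertise, i.e. (saved & host & MASK) !=
			 * (saved & MASK).
			 */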
660 			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
661 			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
662 				log_debug("%s: incompatible cpu features "
663 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
664 				    code, leaf);
665 				return (-1);
666 			}
667 			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
668 			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
669 				log_debug("%s: incompatible cpu features "
670 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
671 				    code, leaf);
672 				return (-1);
673 			}
674 			break;
675 
676 		case 0x07:
677 			CPUID_LEAF(code, leaf, a, b, c, d);
			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: b", __func__,
				    code, leaf);
				return (-1);
			}
			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
				    code, leaf);
				return (-1);
			}
692 			break;
693 
694 		case 0x0d:
695 			CPUID_LEAF(code, leaf, a, b, c, d);
696 			if (vmh->vmh_cpuids[i].b > b) {
697 				log_debug("%s: incompatible cpu: insufficient "
698 				    "max save area for enabled XCR0 features",
699 				    __func__);
700 				return (-1);
701 			}
702 			if (vmh->vmh_cpuids[i].c > c) {
703 				log_debug("%s: incompatible cpu: insufficient "
704 				    "max save area for supported XCR0 features",
705 				    __func__);
706 				return (-1);
707 			}
708 			break;
709 
710 		case 0x80000001:
711 			CPUID_LEAF(code, leaf, a, b, c, d);
712 			if ((vmh->vmh_cpuids[i].a & a) !=
713 			    vmh->vmh_cpuids[i].a) {
714 				log_debug("%s: incompatible cpu features "
715 				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
716 				    code, leaf);
717 				return (-1);
718 			}
719 			if ((vmh->vmh_cpuids[i].c & c) !=
720 			    vmh->vmh_cpuids[i].c) {
721 				log_debug("%s: incompatible cpu features "
722 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
723 				    code, leaf);
724 				return (-1);
725 			}
726 			if ((vmh->vmh_cpuids[i].d & d) !=
727 			    vmh->vmh_cpuids[i].d) {
728 				log_debug("%s: incompatible cpu features "
729 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
730 				    code, leaf);
731 				return (-1);
732 			}
733 			break;
734 
735 		default:
736 			log_debug("%s: unknown code 0x%x", __func__, code);
737 			return (-1);
738 		}
739 	}
740 
741 	return (0);
742 }
743 
744 void
745 vmd_sighdlr(int sig, short event, void *arg)
746 {
747 	if (privsep_process != PROC_PARENT)
748 		return;
749 	log_debug("%s: handling signal", __func__);
750 
751 	switch (sig) {
752 	case SIGHUP:
753 		log_info("%s: reload requested with SIGHUP", __func__);
754 
755 		/*
756 		 * This is safe because libevent uses async signal handlers
757 		 * that run in the event loop and not in signal context.
758 		 */
759 		(void)vmd_reload(0, NULL);
760 		break;
761 	case SIGPIPE:
762 		log_info("%s: ignoring SIGPIPE", __func__);
763 		break;
764 	case SIGUSR1:
765 		log_info("%s: ignoring SIGUSR1", __func__);
766 		break;
767 	case SIGTERM:
768 	case SIGINT:
769 		vmd_shutdown();
770 		break;
771 	default:
772 		fatalx("unexpected signal");
773 	}
774 }
775 
776 __dead void
777 usage(void)
778 {
779 	extern char *__progname;
780 	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
781 	    __progname);
782 	exit(1);
783 }
784 
785 int
786 main(int argc, char **argv)
787 {
788 	struct privsep		*ps;
789 	int			 ch;
790 	enum privsep_procid	 proc_id = PROC_PARENT;
791 	int			 proc_instance = 0, vm_launch = 0;
792 	int			 vmm_fd = -1, vm_fd = -1;
793 	const char		*errp, *title = NULL;
794 	int			 argc0 = argc;
795 	char			 dev_type = '\0';
796 
797 	log_init(0, LOG_DAEMON);
798 
799 	if ((env = calloc(1, sizeof(*env))) == NULL)
800 		fatal("calloc: env");
801 
802 	while ((ch = getopt(argc, argv, "D:P:I:V:X:df:i:nt:v")) != -1) {
803 		switch (ch) {
804 		case 'D':
805 			if (cmdline_symset(optarg) < 0)
806 				log_warnx("could not parse macro definition %s",
807 				    optarg);
808 			break;
809 		case 'd':
810 			env->vmd_debug = 2;
811 			break;
812 		case 'f':
813 			conffile = optarg;
814 			break;
815 		case 'v':
816 			env->vmd_verbose++;
817 			break;
818 		/* vmd fork/exec */
819 		case 'n':
820 			env->vmd_noaction = 1;
821 			break;
822 		case 'P':
823 			title = optarg;
824 			proc_id = proc_getid(procs, nitems(procs), title);
825 			if (proc_id == PROC_MAX)
826 				fatalx("invalid process name");
827 			break;
828 		case 'I':
829 			proc_instance = strtonum(optarg, 0,
830 			    PROC_MAX_INSTANCES, &errp);
831 			if (errp)
832 				fatalx("invalid process instance");
833 			break;
834 		/* child vm and device fork/exec */
835 		case 'V':
836 			vm_launch = VMD_LAUNCH_VM;
837 			vm_fd = strtonum(optarg, 0, 128, &errp);
838 			if (errp)
839 				fatalx("invalid vm fd");
840 			break;
841 		case 'X':
842 			vm_launch = VMD_LAUNCH_DEV;
843 			vm_fd = strtonum(optarg, 0, 128, &errp);
844 			if (errp)
845 				fatalx("invalid device fd");
846 			break;
847 		case 't':
848 			dev_type = *optarg;
849 			switch (dev_type) {
850 			case VMD_DEVTYPE_NET:
851 			case VMD_DEVTYPE_DISK:
852 				break;
853 			default: fatalx("invalid device type");
854 			}
855 			break;
856 		case 'i':
857 			vmm_fd = strtonum(optarg, 0, 128, &errp);
858 			if (errp)
859 				fatalx("invalid vmm fd");
860 			break;
861 		default:
862 			usage();
863 		}
864 	}
865 
866 	argc -= optind;
867 	if (argc > 0)
868 		usage();
869 
870 	if (env->vmd_noaction && !env->vmd_debug)
871 		env->vmd_debug = 1;
872 
873 	log_init(env->vmd_debug, LOG_DAEMON);
874 	log_setverbose(env->vmd_verbose);
875 
876 	/* Re-exec from the vmm child process requires an absolute path. */
877 	if (proc_id == PROC_PARENT && *argv[0] != '/')
878 		fatalx("re-exec requires execution with an absolute path");
879 	env->argv0 = argv[0];
880 
881 	/* check for root privileges */
882 	if (env->vmd_noaction == 0 && !vm_launch) {
883 		if (geteuid())
884 			fatalx("need root privileges");
885 	}
886 
887 	ps = &env->vmd_ps;
888 	ps->ps_env = env;
889 	env->vmd_fd = vmm_fd;
890 
891 	if (config_init(env) == -1)
892 		fatal("failed to initialize configuration");
893 
894 	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
895 		fatal("unknown user %s", VMD_USER);
896 
897 	/* First proc runs as root without pledge but in default chroot */
898 	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
899 	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */
900 
	/*
	 * If we're launching a new vm or one of its devices, we hand
	 * control to the corresponding main loop here and never return.
	 */
904 	if (vm_launch == VMD_LAUNCH_VM) {
905 		vm_main(vm_fd, vmm_fd);
906 		/* NOTREACHED */
907 	} else if (vm_launch == VMD_LAUNCH_DEV) {
908 		if (dev_type == VMD_DEVTYPE_NET) {
909 			vionet_main(vm_fd, vmm_fd);
910 			/* NOTREACHED */
911 		} else if (dev_type == VMD_DEVTYPE_DISK) {
912 			vioblk_main(vm_fd, vmm_fd);
913 			/* NOTREACHED */
914 		}
915 		fatalx("unsupported device type '%c'", dev_type);
916 	}
917 
918 	/* Open /dev/vmm early. */
919 	if (env->vmd_noaction == 0 && proc_id == PROC_PARENT) {
920 		env->vmd_fd = open(VMM_NODE, O_RDWR);
921 		if (env->vmd_fd == -1)
922 			fatal("%s", VMM_NODE);
923 	}
924 
925 	/* Configure the control socket */
926 	ps->ps_csock.cs_name = SOCKET_NAME;
927 	TAILQ_INIT(&ps->ps_rcsocks);
928 
929 	/* Configuration will be parsed after forking the children */
930 	env->vmd_conffile = conffile;
931 
932 	if (env->vmd_noaction)
933 		ps->ps_noaction = 1;
934 	ps->ps_instance = proc_instance;
935 	if (title != NULL)
936 		ps->ps_title[proc_id] = title;
937 
938 	/* only the parent returns */
939 	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
940 	    proc_id);
941 
942 	log_procinit("parent");
943 	if (!env->vmd_debug && daemon(0, 0) == -1)
944 		fatal("can't daemonize");
945 
946 	if (ps->ps_noaction == 0)
947 		log_info("startup");
948 
949 	event_init();
950 
951 	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
952 	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
953 	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
954 	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
955 	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);
956 
957 	signal_add(&ps->ps_evsigint, NULL);
958 	signal_add(&ps->ps_evsigterm, NULL);
959 	signal_add(&ps->ps_evsighup, NULL);
960 	signal_add(&ps->ps_evsigpipe, NULL);
961 	signal_add(&ps->ps_evsigusr1, NULL);
962 
963 	if (!env->vmd_noaction)
964 		proc_connect(ps);
965 
966 	if (vmd_configure() == -1)
967 		fatalx("configuration failed");
968 
969 	event_dispatch();
970 
971 	log_debug("parent exiting");
972 
973 	return (0);
974 }
975 
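/*
 * Start the next batch of VMs that are still marked as waiting, at
 * most vmd_cfg.parallelism at a time.  If any remain, re-arm the
 * staggered start timer to run again after the configured delay.
 */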
976 void
977 start_vm_batch(int fd, short type, void *args)
978 {
979 	int		i = 0;
980 	struct vmd_vm	*vm;
981 
982 	log_debug("%s: starting batch of %d vms", __func__,
983 	    env->vmd_cfg.parallelism);
984 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
985 		if (!(vm->vm_state & VM_STATE_WAITING)) {
986 			log_debug("%s: not starting vm %s (disabled)",
987 			    __func__,
988 			    vm->vm_params.vmc_params.vcp_name);
989 			continue;
990 		}
991 		i++;
992 		if (i > env->vmd_cfg.parallelism) {
993 			evtimer_add(&staggered_start_timer,
994 			    &env->vmd_cfg.delay);
995 			break;
996 		}
997 		vm->vm_state &= ~VM_STATE_WAITING;
998 		config_setvm(&env->vmd_ps, vm, -1, vm->vm_params.vmc_owner.uid);
999 	}
1000 	log_debug("%s: done starting vms", __func__);
1001 }
1002 
1003 int
1004 vmd_configure(void)
1005 {
	int			 ncpus;
	struct vmd_switch	*vsw;
	int			 ncpu_mib[] = { CTL_HW, HW_NCPUONLINE };
	size_t			 ncpus_sz = sizeof(ncpus);
1010 
1011 	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
1012 		fatal("open %s", PATH_PTMDEV);
1013 
	/*
	 * pledge in the parent process:
	 * stdio - for malloc and basic I/O including events.
	 * rpath - for reload to open and read the configuration files.
	 * wpath - for opening disk images and tap devices.
	 * tty - for openpty(3) and TIOCUCNTL.
	 * proc - for kill(2) to terminate children safely.
	 * sendfd - for passing disks, interfaces and other fds.
	 * recvfd - for send and receive.
	 * getpw - for looking up user or group ids by name.
	 * chown, fattr - for changing tty ownership.
	 * flock - for locking disk files.
	 */
1027 	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
1028 	    " chown fattr flock", NULL) == -1)
1029 		fatal("pledge");
1030 
1031 	if (parse_config(env->vmd_conffile) == -1) {
1032 		proc_kill(&env->vmd_ps);
1033 		exit(1);
1034 	}
1035 
1036 	if (env->vmd_noaction) {
1037 		fprintf(stderr, "configuration OK\n");
1038 		proc_kill(&env->vmd_ps);
1039 		exit(0);
1040 	}
1041 
1042 	/* Send VMM device fd to vmm proc. */
1043 	proc_compose_imsg(&env->vmd_ps, PROC_VMM, -1,
1044 	    IMSG_VMDOP_RECEIVE_VMM_FD, -1, env->vmd_fd, NULL, 0);
1045 
1046 	/* Send shared global configuration to all children */
1047 	if (config_setconfig(env) == -1)
1048 		return (-1);
1049 
1050 	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1051 		if (vsw->sw_running)
1052 			continue;
1053 		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
1054 			log_warn("%s: failed to create switch %s",
1055 			    __func__, vsw->sw_name);
1056 			switch_remove(vsw);
1057 			return (-1);
1058 		}
1059 	}
1060 
1061 	if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) {
1062 		env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY;
		if (sysctl(ncpu_mib, nitems(ncpu_mib), &ncpus, &ncpus_sz,
		    NULL, 0) == -1)
1064 			ncpus = 1;
1065 		env->vmd_cfg.parallelism = ncpus;
1066 		log_debug("%s: setting staggered start configuration to "
1067 		    "parallelism: %d and delay: %lld",
1068 		    __func__, ncpus, (long long) env->vmd_cfg.delay.tv_sec);
1069 	}
1070 
1071 	log_debug("%s: starting vms in staggered fashion", __func__);
1072 	evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
1073 	/* start first batch */
1074 	start_vm_batch(0, 0, NULL);
1075 
1076 	return (0);
1077 }
1078 
1079 int
1080 vmd_reload(unsigned int reset, const char *filename)
1081 {
1082 	struct vmd_vm		*vm, *next_vm;
1083 	struct vmd_switch	*vsw;
1084 	int			 reload = 0;
1085 
1086 	/* Switch back to the default config file */
1087 	if (filename == NULL || *filename == '\0') {
1088 		filename = env->vmd_conffile;
1089 		reload = 1;
1090 	}
1091 
1092 	log_debug("%s: level %d config file %s", __func__, reset, filename);
1093 
1094 	if (reset) {
1095 		/* Purge the configuration */
1096 		config_purge(env, reset);
1097 		config_setreset(env, reset);
1098 	} else {
1099 		/*
1100 		 * Load or reload the configuration.
1101 		 *
1102 		 * Reloading removes all non-running VMs before processing the
1103 		 * config file, whereas loading only adds to the existing list
1104 		 * of VMs.
1105 		 */
1106 
1107 		if (reload) {
1108 			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
1109 			    next_vm) {
1110 				if (!(vm->vm_state & VM_STATE_RUNNING)) {
1111 					DPRINTF("%s: calling vm_remove",
1112 					    __func__);
1113 					vm_remove(vm, __func__);
1114 				}
1115 			}
1116 		}
1117 
1118 		if (parse_config(filename) == -1) {
1119 			log_debug("%s: failed to load config file %s",
1120 			    __func__, filename);
1121 			return (-1);
1122 		}
1123 
1124 		if (reload) {
1125 			/* Update shared global configuration in all children */
1126 			if (config_setconfig(env) == -1)
1127 				return (-1);
1128 		}
1129 
1130 		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1131 			if (vsw->sw_running)
1132 				continue;
1133 			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
1134 				log_warn("%s: failed to create switch %s",
1135 				    __func__, vsw->sw_name);
1136 				switch_remove(vsw);
1137 				return (-1);
1138 			}
1139 		}
1140 
1141 		log_debug("%s: starting vms in staggered fashion", __func__);
1142 		evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
1143 		/* start first batch */
1144 		start_vm_batch(0, 0, NULL);
1145 
	}
1147 
1148 	return (0);
1149 }
1150 
1151 void
1152 vmd_shutdown(void)
1153 {
1154 	struct vmd_vm *vm, *vm_next;
1155 
1156 	log_debug("%s: performing shutdown", __func__);
1157 
1158 	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
1159 		vm_remove(vm, __func__);
1160 	}
1161 
1162 	proc_kill(&env->vmd_ps);
1163 	free(env);
1164 
1165 	log_warnx("parent terminating");
1166 	exit(0);
1167 }
1168 
1169 struct vmd_vm *
1170 vm_getbyvmid(uint32_t vmid)
1171 {
1172 	struct vmd_vm	*vm;
1173 
1174 	if (vmid == 0)
1175 		return (NULL);
1176 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1177 		if (vm->vm_vmid == vmid)
1178 			return (vm);
1179 	}
1180 
1181 	return (NULL);
1182 }
1183 
1184 struct vmd_vm *
1185 vm_getbyid(uint32_t id)
1186 {
1187 	struct vmd_vm	*vm;
1188 
1189 	if (id == 0)
1190 		return (NULL);
1191 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1192 		if (vm->vm_params.vmc_params.vcp_id == id)
1193 			return (vm);
1194 	}
1195 
1196 	return (NULL);
1197 }
1198 
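/*
 * The two helpers below translate between the kernel's vmm(4) id
 * (vcp_id) and vmd's internal vmid.  Both return 0 if the VM is not
 * found.
 */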
1199 uint32_t
1200 vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1201 {
1202 	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1203 		return (0);
1204 	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1205 	    id, vm->vm_vmid);
1206 	return (vm->vm_vmid);
1207 }
1208 
1209 uint32_t
1210 vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1211 {
1212 	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1213 		return (0);
1214 	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1215 	    vmid, vm->vm_params.vmc_params.vcp_id);
1216 	return (vm->vm_params.vmc_params.vcp_id);
1217 }
1218 
1219 struct vmd_vm *
1220 vm_getbyname(const char *name)
1221 {
1222 	struct vmd_vm	*vm;
1223 
1224 	if (name == NULL)
1225 		return (NULL);
1226 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1227 		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1228 			return (vm);
1229 	}
1230 
1231 	return (NULL);
1232 }
1233 
1234 struct vmd_vm *
1235 vm_getbypid(pid_t pid)
1236 {
1237 	struct vmd_vm	*vm;
1238 
1239 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1240 		if (vm->vm_pid == pid)
1241 			return (vm);
1242 	}
1243 
1244 	return (NULL);
1245 }
1246 
1247 void
1248 vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
1249 {
1250 	struct privsep	*ps = &env->vmd_ps;
1251 	unsigned int	 i, j;
1252 
1253 	if (vm == NULL)
1254 		return;
1255 
1256 	log_debug("%s: %s %s stopping vm %d%s",
1257 	    __func__, ps->ps_title[privsep_process], caller,
1258 	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");
1259 
1260 	vm->vm_state &= ~(VM_STATE_RECEIVED | VM_STATE_RUNNING
1261 	    | VM_STATE_SHUTDOWN);
1262 
1263 	if (vm->vm_iev.ibuf.fd != -1) {
1264 		event_del(&vm->vm_iev.ev);
1265 		close(vm->vm_iev.ibuf.fd);
1266 	}
1267 	for (i = 0; i < VM_MAX_DISKS_PER_VM; i++) {
1268 		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
1269 			if (vm->vm_disks[i][j] != -1) {
1270 				close(vm->vm_disks[i][j]);
1271 				vm->vm_disks[i][j] = -1;
1272 			}
1273 		}
1274 	}
1275 	for (i = 0; i < VM_MAX_NICS_PER_VM; i++) {
1276 		if (vm->vm_ifs[i].vif_fd != -1) {
1277 			close(vm->vm_ifs[i].vif_fd);
1278 			vm->vm_ifs[i].vif_fd = -1;
1279 		}
1280 		free(vm->vm_ifs[i].vif_name);
1281 		free(vm->vm_ifs[i].vif_switch);
1282 		free(vm->vm_ifs[i].vif_group);
1283 		vm->vm_ifs[i].vif_name = NULL;
1284 		vm->vm_ifs[i].vif_switch = NULL;
1285 		vm->vm_ifs[i].vif_group = NULL;
1286 	}
1287 	if (vm->vm_kernel != -1) {
1288 		close(vm->vm_kernel);
1289 		vm->vm_kernel = -1;
1290 	}
1291 	if (vm->vm_cdrom != -1) {
1292 		close(vm->vm_cdrom);
1293 		vm->vm_cdrom = -1;
1294 	}
1295 	if (!keeptty) {
1296 		vm_closetty(vm);
1297 		vm->vm_uid = 0;
1298 	}
1299 }
1300 
1301 void
1302 vm_remove(struct vmd_vm *vm, const char *caller)
1303 {
1304 	struct privsep	*ps = &env->vmd_ps;
1305 
1306 	if (vm == NULL)
1307 		return;
1308 
1309 	log_debug("%s: %s %s removing vm %d from running config",
1310 	    __func__, ps->ps_title[privsep_process], caller,
1311 	    vm->vm_vmid);
1312 
1313 	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);
1314 
1315 	vm_stop(vm, 0, caller);
1316 	if (vm->vm_kernel_path != NULL && !vm->vm_from_config)
1317 		free(vm->vm_kernel_path);
1318 	free(vm);
1319 }
1320 
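/*
 * Map a VM name and owner to a stable vmid: known name/uid pairs keep
 * the id recorded in vmd_known, while new pairs claim the next id
 * from vmd_nvm.  Returns 0 with the id in *id, or -1 on failure.
 */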
1321 int
1322 vm_claimid(const char *name, int uid, uint32_t *id)
1323 {
1324 	struct name2id *n2i = NULL;
1325 
1326 	TAILQ_FOREACH(n2i, env->vmd_known, entry)
1327 		if (strcmp(n2i->name, name) == 0 && n2i->uid == uid)
1328 			goto out;
1329 
1330 	if (++env->vmd_nvm == 0) {
1331 		log_warnx("too many vms");
1332 		return (-1);
1333 	}
1334 	if ((n2i = calloc(1, sizeof(struct name2id))) == NULL) {
1335 		log_warnx("could not alloc vm name");
1336 		return (-1);
1337 	}
1338 	n2i->id = env->vmd_nvm;
1339 	n2i->uid = uid;
1340 	if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name)) {
1341 		log_warnx("vm name too long");
1342 		free(n2i);
1343 		return (-1);
1344 	}
1345 	TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry);
1346 
1347 out:
1348 	*id = n2i->id;
1349 	return (0);
1350 }
1351 
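/*
 * Validate the creation parameters and register a new VM in the list
 * of known VMs.  Returns 0 on success; on failure -1 is returned with
 * errno set, and for an already existing VM (*ret_vm) points at it
 * with errno set to EALREADY.
 */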
1352 int
1353 vm_register(struct privsep *ps, struct vmop_create_params *vmc,
1354     struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
1355 {
1356 	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
1357 	struct vm_create_params	*vcp = &vmc->vmc_params;
1358 	struct vmop_owner	*vmo = NULL;
1359 	uint32_t		 nid, rng;
1360 	unsigned int		 i, j;
1361 	struct vmd_switch	*sw;
1362 	char			*s;
1363 	int			 ret = 0;
1364 
1365 	/* Check if this is an instance of another VM */
1366 	if ((ret = vm_instance(ps, &vm_parent, vmc, uid)) != 0) {
1367 		errno = ret; /* XXX might set invalid errno */
1368 		return (-1);
1369 	}
1370 
1371 	errno = 0;
1372 	*ret_vm = NULL;
1373 
1374 	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1375 	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1376 		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
1377 		    uid) != 0) {
1378 			errno = EPERM;
1379 			goto fail;
1380 		}
1381 		vm->vm_kernel = vmc->vmc_kernel;
1382 		*ret_vm = vm;
1383 		errno = EALREADY;
1384 		goto fail;
1385 	}
1386 
1387 	if (vm_parent != NULL)
1388 		vmo = &vm_parent->vm_params.vmc_insowner;
1389 
1390 	/* non-root users can only start existing VMs or instances */
1391 	if (vm_checkperm(NULL, vmo, uid) != 0) {
1392 		log_warnx("permission denied");
1393 		errno = EPERM;
1394 		goto fail;
1395 	}
1396 	if (vmc->vmc_flags == 0) {
1397 		log_warnx("invalid configuration, no devices");
1398 		errno = VMD_DISK_MISSING;
1399 		goto fail;
1400 	}
1401 	if (vcp->vcp_ncpus == 0)
1402 		vcp->vcp_ncpus = 1;
1403 	if (vcp->vcp_memranges[0].vmr_size == 0)
1404 		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
1405 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
1406 		log_warnx("invalid number of CPUs");
1407 		goto fail;
1408 	} else if (vmc->vmc_ndisks > VM_MAX_DISKS_PER_VM) {
1409 		log_warnx("invalid number of disks");
1410 		goto fail;
1411 	} else if (vmc->vmc_nnics > VM_MAX_NICS_PER_VM) {
1412 		log_warnx("invalid number of interfaces");
1413 		goto fail;
1414 	} else if (vmc->vmc_kernel == -1 && vmc->vmc_ndisks == 0
1415 	    && strlen(vmc->vmc_cdrom) == 0) {
1416 		log_warnx("no kernel or disk/cdrom specified");
1417 		goto fail;
1418 	} else if (strlen(vcp->vcp_name) == 0) {
1419 		log_warnx("invalid VM name");
1420 		goto fail;
1421 	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
1422 	    *vcp->vcp_name == '_') {
1423 		log_warnx("invalid VM name");
1424 		goto fail;
1425 	} else {
1426 		for (s = vcp->vcp_name; *s != '\0'; ++s) {
			if (!(isalnum((unsigned char)*s) || *s == '.' ||
1428 			    *s == '-' || *s == '_')) {
1429 				log_warnx("invalid VM name");
1430 				goto fail;
1431 			}
1432 		}
1433 	}
1434 
1435 	if ((vm = calloc(1, sizeof(*vm))) == NULL)
1436 		goto fail;
1437 
1438 	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
1439 	vmc = &vm->vm_params;
1440 	vcp = &vmc->vmc_params;
1441 	vm->vm_pid = -1;
1442 	vm->vm_tty = -1;
1443 	vm->vm_receive_fd = -1;
1444 	vm->vm_kernel = -1;
1445 	vm->vm_state &= ~VM_STATE_PAUSED;
1446 
1447 	if (vmc->vmc_kernel > -1)
1448 		vm->vm_kernel = vmc->vmc_kernel;
1449 
1450 	for (i = 0; i < VM_MAX_DISKS_PER_VM; i++)
1451 		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
1452 			vm->vm_disks[i][j] = -1;
1453 	for (i = 0; i < VM_MAX_NICS_PER_VM; i++)
1454 		vm->vm_ifs[i].vif_fd = -1;
1455 	for (i = 0; i < vmc->vmc_nnics; i++) {
1456 		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
1457 			/* inherit per-interface flags from the switch */
1458 			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
1459 		}
1460 
1461 		/*
1462 		 * If the MAC address is zero, always randomize it in vmd(8)
1463 		 * because we cannot rely on the guest OS to do the right
1464 		 * thing like OpenBSD does.  Based on ether_fakeaddr()
1465 		 * from the kernel, incremented by one to differentiate
1466 		 * the source.
1467 		 */
1468 		if (memcmp(zero_mac, &vmc->vmc_macs[i], ETHER_ADDR_LEN) == 0) {
1469 			rng = arc4random();
1470 			vmc->vmc_macs[i][0] = 0xfe;
1471 			vmc->vmc_macs[i][1] = 0xe1;
1472 			vmc->vmc_macs[i][2] = 0xba + 1;
1473 			vmc->vmc_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
1474 			vmc->vmc_macs[i][4] = rng;
1475 			vmc->vmc_macs[i][5] = rng >> 8;
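			/* Result: fe:e1:bb:dX:YY:YY with X from the index. */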
1476 		}
1477 	}
1478 	vm->vm_cdrom = -1;
1479 	vm->vm_iev.ibuf.fd = -1;
1480 
1481 	/*
1482 	 * Assign a new internal Id if not specified and we succeed in
1483 	 * claiming a new Id.
1484 	 */
1485 	if (id != 0)
1486 		vm->vm_vmid = id;
1487 	else if (vm_claimid(vcp->vcp_name, uid, &nid) == -1)
1488 		goto fail;
1489 	else
1490 		vm->vm_vmid = nid;
1491 
1492 	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
1493 	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);
1494 
1495 	*ret_vm = vm;
1496 	return (0);
1497  fail:
1498 	if (errno == 0)
1499 		errno = EINVAL;
1500 	return (-1);
1501 }
1502 
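/*
 * If the request is an instance of an existing VM, inherit any unset
 * parameters from the parent VM and enforce the parent's instance
 * permissions.  Returns 0 on success or an error number otherwise.
 */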
1503 int
1504 vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1505     struct vmop_create_params *vmc, uid_t uid)
1506 {
1507 	char			*name;
1508 	struct vm_create_params	*vcp = &vmc->vmc_params;
1509 	struct vmop_create_params *vmcp;
1510 	struct vm_create_params	*vcpp;
1511 	unsigned int		 i, j;
1512 
1513 	/* return without error if the parent is NULL (nothing to inherit) */
1514 	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1515 	    vmc->vmc_instance[0] == '\0')
1516 		return (0);
1517 
1518 	if ((*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) {
1519 		return (VMD_PARENT_INVALID);
1520 	}
1521 
1522 	vmcp = &(*vm_parent)->vm_params;
1523 	vcpp = &vmcp->vmc_params;
1524 
1525 	/* Are we allowed to create an instance from this VM? */
1526 	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1527 		log_warnx("vm \"%s\" no permission to create vm instance",
1528 		    vcpp->vcp_name);
		return (EPERM);
1530 	}
1531 
1532 	name = vcp->vcp_name;
1533 
1534 	if (vm_getbyname(vcp->vcp_name) != NULL ||
1535 	    vm_getbyvmid(vcp->vcp_id) != NULL) {
1536 		return (EPROCLIM);
1537 	}
1538 
1539 	/* CPU */
1540 	if (vcp->vcp_ncpus == 0)
1541 		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1542 	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1543 	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1544 		log_warnx("vm \"%s\" no permission to set cpus", name);
1545 		return (EPERM);
1546 	}
1547 
1548 	/* memory */
1549 	if (vcp->vcp_memranges[0].vmr_size == 0)
1550 		vcp->vcp_memranges[0].vmr_size =
1551 		    vcpp->vcp_memranges[0].vmr_size;
1552 	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1553 	    vcp->vcp_memranges[0].vmr_size !=
1554 	    vcpp->vcp_memranges[0].vmr_size) {
1555 		log_warnx("vm \"%s\" no permission to set memory", name);
1556 		return (EPERM);
1557 	}
1558 
1559 	/* disks cannot be inherited */
1560 	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1561 	    vmc->vmc_ndisks) {
1562 		log_warnx("vm \"%s\" no permission to set disks", name);
1563 		return (EPERM);
1564 	}
1565 	for (i = 0; i < vmc->vmc_ndisks; i++) {
1566 		/* Check if this disk is already used in the parent */
1567 		for (j = 0; j < vmcp->vmc_ndisks; j++) {
1568 			if (strcmp(vmc->vmc_disks[i],
1569 			    vmcp->vmc_disks[j]) == 0) {
1570 				log_warnx("vm \"%s\" disk %s cannot be reused",
1571 				    name, vmc->vmc_disks[i]);
1572 				return (EBUSY);
1573 			}
1574 		}
1575 		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1576 	}
1577 
1578 	/* interfaces */
1579 	if (vmc->vmc_nnics > 0 &&
1580 	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1581 	    vmc->vmc_nnics != vmcp->vmc_nnics) {
1582 		log_warnx("vm \"%s\" no permission to set interfaces", name);
1583 		return (EPERM);
1584 	}
1585 	for (i = 0; i < vmcp->vmc_nnics; i++) {
1586 		/* Interface got overwritten */
1587 		if (i < vmc->vmc_nnics)
1588 			continue;
1589 
1590 		/* Copy interface from parent */
1591 		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1592 		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1593 		    sizeof(vmc->vmc_ifnames[i]));
1594 		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1595 		    sizeof(vmc->vmc_ifswitch[i]));
1596 		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1597 		    sizeof(vmc->vmc_ifgroup[i]));
1598 		memcpy(vmc->vmc_macs[i], vmcp->vmc_macs[i],
1599 		    sizeof(vmc->vmc_macs[i]));
1600 		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1601 		vmc->vmc_nnics++;
1602 	}
1603 	for (i = 0; i < vmc->vmc_nnics; i++) {
1604 		for (j = 0; j < vmcp->vmc_nnics; j++) {
1605 			if (memcmp(zero_mac, vmc->vmc_macs[i],
1606 			    sizeof(vmc->vmc_macs[i])) != 0 &&
1607 			    memcmp(vmcp->vmc_macs[i], vmc->vmc_macs[i],
1608 			    sizeof(vmc->vmc_macs[i])) != 0) {
1609 				log_warnx("vm \"%s\" lladdr cannot be reused",
1610 				    name);
1611 				return (EBUSY);
1612 			}
1613 			if (strlen(vmc->vmc_ifnames[i]) &&
1614 			    strcmp(vmc->vmc_ifnames[i],
1615 			    vmcp->vmc_ifnames[j]) == 0) {
				log_warnx("vm \"%s\" %s cannot be reused",
				    name, vmc->vmc_ifnames[i]);
1618 				return (EBUSY);
1619 			}
1620 		}
1621 	}
1622 
1623 	/* kernel */
1624 	if (vmc->vmc_kernel > -1 || ((*vm_parent)->vm_kernel_path != NULL &&
1625 		strnlen((*vm_parent)->vm_kernel_path, PATH_MAX) < PATH_MAX)) {
1626 		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1627 			log_warnx("vm \"%s\" no permission to set boot image",
1628 			    name);
1629 			return (EPERM);
1630 		}
1631 		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1632 	}
1633 
1634 	/* cdrom */
1635 	if (strlen(vmc->vmc_cdrom) > 0) {
1636 		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1637 			log_warnx("vm \"%s\" no permission to set cdrom", name);
1638 			return (EPERM);
1639 		}
1640 		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1641 	} else if (strlcpy(vmc->vmc_cdrom, vmcp->vmc_cdrom,
1642 	    sizeof(vmc->vmc_cdrom)) >= sizeof(vmc->vmc_cdrom)) {
1643 		log_warnx("vm \"%s\" cdrom name too long", name);
1644 		return (EINVAL);
1645 	}
1646 
1647 	/* user */
1648 	if (vmc->vmc_owner.uid == 0)
1649 		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1650 	else if (vmc->vmc_owner.uid != uid &&
1651 	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1652 		log_warnx("vm \"%s\" user mismatch", name);
1653 		return (EPERM);
1654 	}
1655 
1656 	/* group */
1657 	if (vmc->vmc_owner.gid == 0)
1658 		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1659 	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1660 		log_warnx("vm \"%s\" group mismatch", name);
1661 		return (EPERM);
1662 	}
1663 
1664 	/* child instances */
1665 	if (vmc->vmc_insflags) {
1666 		log_warnx("vm \"%s\" cannot change instance permissions", name);
1667 		return (EPERM);
1668 	}
1669 	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1670 		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
		vmc->vmc_insowner.uid = vmcp->vmc_insowner.uid;
1672 		vmc->vmc_insflags = vmcp->vmc_insflags;
1673 	} else {
1674 		vmc->vmc_insowner.gid = 0;
1675 		vmc->vmc_insowner.uid = 0;
1676 		vmc->vmc_insflags = 0;
1677 	}
1678 
1679 	/* finished, remove instance flags */
1680 	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1681 
1682 	return (0);
1683 }
1684 
1685 /*
1686  * vm_checkperm
1687  *
1688  * Checks if the user represented by the 'uid' parameter is allowed to
1689  * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1690  * console.)
1691  *
1692  * Parameters:
1693  *  vm: the VM whose permission is to be checked
1694  *  vmo: the required uid/gid to be checked
1695  *  uid: the user ID of the user making the request
1696  *
1697  * Return values:
1698  *   0: the permission should be granted
 *  -1: the permission check failed (also returned if vmo == NULL)
1700  */
1701 int
1702 vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1703 {
1704 	struct group	*gr;
1705 	struct passwd	*pw;
1706 	char		**grmem;
1707 
1708 	/* root has no restrictions */
1709 	if (uid == 0)
1710 		return (0);
1711 
1712 	if (vmo == NULL)
1713 		return (-1);
1714 
1715 	/* check user */
1716 	if (vm == NULL) {
		if (vmo->uid == uid)
1718 			return (0);
1719 	} else {
		/*
		 * Check the user of the running vm (the owner of a running
		 * vm can be different from, or more specific than, the
		 * configured owner).
		 */
1724 		if (((vm->vm_state & VM_STATE_RUNNING) && vm->vm_uid == uid) ||
1725 		    (!(vm->vm_state & VM_STATE_RUNNING) && vmo->uid == uid))
1726 			return (0);
1727 	}
1728 
1729 	/* check groups */
1730 	if (vmo->gid != -1) {
1731 		if ((pw = getpwuid(uid)) == NULL)
1732 			return (-1);
1733 		if (pw->pw_gid == vmo->gid)
1734 			return (0);
1735 		if ((gr = getgrgid(vmo->gid)) != NULL) {
1736 			for (grmem = gr->gr_mem; *grmem; grmem++)
1737 				if (strcmp(*grmem, pw->pw_name) == 0)
1738 					return (0);
1739 		}
1740 	}
1741 
1742 	return (-1);
1743 }
1744 
1745 /*
1746  * vm_checkinsflag
1747  *
1748  * Checks whether the non-root user is allowed to set an instance option.
1749  *
1750  * Parameters:
1751  *  vmc: the VM create parameters
1752  *  flag: the flag to be checked
1753  *  uid: the user ID of the user making the request
1754  *
1755  * Return values:
1756  *   0: the permission should be granted
 *  -1: the permission check failed (the flag is not set for this user)
1758  */
1759 int
1760 vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1761 {
1762 	/* root has no restrictions */
1763 	if (uid == 0)
1764 		return (0);
1765 
1766 	if ((vmc->vmc_insflags & flag) == 0)
1767 		return (-1);
1768 
1769 	return (0);
1770 }
1771 
1772 /*
1773  * vm_checkaccess
1774  *
1775  * Checks if the user represented by the 'uid' parameter is allowed to
1776  * access the file described by the 'path' parameter.
1777  *
1778  * Parameters:
1779  *  fd: the file descriptor of the opened file
1780  *  uflag: check if the userid has access to the file
1781  *  uid: the user ID of the user making the request
1782  *  amode: the access flags of R_OK and W_OK
1783  *
1784  * Return values:
1785  *   0: the permission should be granted
1786  *  -1: the permission check failed
1787  */
1788 int
1789 vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
1790 {
1791 	struct group	*gr;
1792 	struct passwd	*pw;
1793 	char		**grmem;
1794 	struct stat	 st;
1795 	mode_t		 mode;
1796 
1797 	if (fd == -1)
1798 		return (-1);
1799 
1800 	/*
1801 	 * File has to be accessible and a regular file
1802 	 */
1803 	if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode))
1804 		return (-1);
1805 
1806 	/* root has no restrictions */
1807 	if (uid == 0 || uflag == 0)
1808 		return (0);
1809 
1810 	/* check other */
1811 	mode = amode & W_OK ? S_IWOTH : 0;
1812 	mode |= amode & R_OK ? S_IROTH : 0;
1813 	if ((st.st_mode & mode) == mode)
1814 		return (0);
1815 
1816 	/* check user */
1817 	mode = amode & W_OK ? S_IWUSR : 0;
1818 	mode |= amode & R_OK ? S_IRUSR : 0;
1819 	if (uid == st.st_uid && (st.st_mode & mode) == mode)
1820 		return (0);
1821 
1822 	/* check groups */
1823 	mode = amode & W_OK ? S_IWGRP : 0;
1824 	mode |= amode & R_OK ? S_IRGRP : 0;
1825 	if ((st.st_mode & mode) != mode)
1826 		return (-1);
1827 	if ((pw = getpwuid(uid)) == NULL)
1828 		return (-1);
1829 	if (pw->pw_gid == st.st_gid)
1830 		return (0);
1831 	if ((gr = getgrgid(st.st_gid)) != NULL) {
1832 		for (grmem = gr->gr_mem; *grmem; grmem++)
1833 			if (strcmp(*grmem, pw->pw_name) == 0)
1834 				return (0);
1835 	}
1836 
1837 	return (-1);
1838 }
1839 
1840 int
1841 vm_opentty(struct vmd_vm *vm)
1842 {
1843 	struct ptmget		 ptm;
1844 	struct stat		 st;
1845 	struct group		*gr;
1846 	uid_t			 uid;
1847 	gid_t			 gid;
1848 	mode_t			 mode;
1849 	int			 on;
1850 
1851 	/*
1852 	 * Open tty with pre-opened PTM fd
1853 	 */
	if (ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1)
1855 		return (-1);
1856 
1857 	/*
1858 	 * We use user ioctl(2) mode to pass break commands.
1859 	 */
1860 	on = 1;
1861 	if (ioctl(ptm.cfd, TIOCUCNTL, &on) == -1)
1862 		fatal("could not enable user ioctl mode");
1863 
1864 	vm->vm_tty = ptm.cfd;
1865 	close(ptm.sfd);
1866 	if (strlcpy(vm->vm_ttyname, ptm.sn, sizeof(vm->vm_ttyname))
1867 	    >= sizeof(vm->vm_ttyname)) {
1868 		log_warnx("%s: truncated ttyname", __func__);
1869 		goto fail;
1870 	}
1871 
1872 	uid = vm->vm_uid;
1873 	gid = vm->vm_params.vmc_owner.gid;
1874 
1875 	if (vm->vm_params.vmc_owner.gid != -1) {
1876 		mode = 0660;
1877 	} else if ((gr = getgrnam("tty")) != NULL) {
1878 		gid = gr->gr_gid;
1879 		mode = 0620;
1880 	} else {
1881 		mode = 0600;
1882 		gid = 0;
1883 	}
1884 
1885 	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
1886 	    __func__, vm->vm_params.vmc_params.vcp_name,
1887 	    vm->vm_ttyname, uid, gid, mode);
1888 
1889 	/*
1890 	 * Change ownership and mode of the tty as required.
1891 	 * Loosely based on the implementation of sshpty.c
1892 	 */
1893 	if (stat(vm->vm_ttyname, &st) == -1)
1894 		goto fail;
1895 
1896 	if (st.st_uid != uid || st.st_gid != gid) {
1897 		if (chown(vm->vm_ttyname, uid, gid) == -1) {
1898 			log_warn("chown %s %d %d failed, uid %d",
1899 			    vm->vm_ttyname, uid, gid, getuid());
1900 
1901 			/* Ignore failure on read-only filesystems */
1902 			if (!((errno == EROFS) &&
1903 			    (st.st_uid == uid || st.st_uid == 0)))
1904 				goto fail;
1905 		}
1906 	}
1907 
1908 	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
1909 		if (chmod(vm->vm_ttyname, mode) == -1) {
1910 			log_warn("chmod %s %o failed, uid %d",
1911 			    vm->vm_ttyname, mode, getuid());
1912 
1913 			/* Ignore failure on read-only filesystems */
1914 			if (!((errno == EROFS) &&
1915 			    (st.st_uid == uid || st.st_uid == 0)))
1916 				goto fail;
1917 		}
1918 	}
1919 
1920 	return (0);
1921  fail:
1922 	vm_closetty(vm);
1923 	return (-1);
1924 }
1925 
1926 void
1927 vm_closetty(struct vmd_vm *vm)
1928 {
1929 	if (vm->vm_tty != -1) {
1930 		/* Release and close the tty */
1931 		if (fchown(vm->vm_tty, 0, 0) == -1)
1932 			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1933 		if (fchmod(vm->vm_tty, 0666) == -1)
1934 			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1935 		close(vm->vm_tty);
1936 		vm->vm_tty = -1;
1937 	}
1938 	memset(&vm->vm_ttyname, 0, sizeof(vm->vm_ttyname));
1939 }
1940 
1941 void
1942 switch_remove(struct vmd_switch *vsw)
1943 {
1944 	if (vsw == NULL)
1945 		return;
1946 
1947 	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1948 
1949 	free(vsw->sw_group);
1950 	free(vsw->sw_name);
1951 	free(vsw);
1952 }
1953 
1954 struct vmd_switch *
1955 switch_getbyname(const char *name)
1956 {
1957 	struct vmd_switch	*vsw;
1958 
1959 	if (name == NULL)
1960 		return (NULL);
1961 	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1962 		if (strcmp(vsw->sw_name, name) == 0)
1963 			return (vsw);
1964 	}
1965 
1966 	return (NULL);
1967 }
1968 
1969 char *
1970 get_string(uint8_t *ptr, size_t len)
1971 {
1972 	size_t	 i;
1973 
1974 	for (i = 0; i < len; i++)
1975 		if (!isprint((unsigned char)ptr[i]))
1976 			break;
1977 
	return (strndup(ptr, i));
1979 }
1980 
1981 uint32_t
1982 prefixlen2mask(uint8_t prefixlen)
1983 {
1984 	if (prefixlen == 0)
1985 		return (0);
1986 
1987 	if (prefixlen > 32)
1988 		prefixlen = 32;
1989 
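	/* e.g. prefixlen 24 -> htonl(0xffffff00) */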
1990 	return (htonl(0xffffffff << (32 - prefixlen)));
1991 }
1992 
1993 void
1994 prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
1995 {
1996 	struct in6_addr	 s6;
1997 	int		 i;
1998 
1999 	if (prefixlen > 128)
2000 		prefixlen = 128;
2001 
2002 	memset(&s6, 0, sizeof(s6));
2003 	for (i = 0; i < prefixlen / 8; i++)
2004 		s6.s6_addr[i] = 0xff;
2005 	i = prefixlen % 8;
2006 	if (i)
2007 		s6.s6_addr[prefixlen / 8] = 0xff00 >> i;
2008 
2009 	memcpy(mask, &s6, sizeof(s6));
2010 }
2011 
2012 void
2013 getmonotime(struct timeval *tv)
2014 {
2015 	struct timespec	 ts;
2016 
2017 	if (clock_gettime(CLOCK_MONOTONIC, &ts))
2018 		fatal("clock_gettime");
2019 
2020 	TIMESPEC_TO_TIMEVAL(tv, &ts);
2021 }
2022 
2023 static inline void
2024 vm_terminate(struct vmd_vm *vm, const char *caller)
2025 {
2026 	if (vm->vm_from_config)
2027 		vm_stop(vm, 0, caller);
2028 	else {
2029 		/* vm_remove calls vm_stop */
2030 		vm_remove(vm, caller);
2031 	}
2032 }
2033 
2034 /*
2035  * Utility function for closing vm file descriptors. Assumes an fd of -1 was
2036  * already closed or never opened.
2037  *
2038  * Returns 0 on success, otherwise -1 on failure.
2039  */
2040 int
2041 close_fd(int fd)
2042 {
2043 	int	ret;
2044 
2045 	if (fd == -1)
2046 		return (0);
2047 
2048 #ifdef POSIX_CLOSE_RESTART
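	/*
	 * Retry close(2) on EINTR only where POSIX_CLOSE_RESTART is
	 * defined; without that guarantee the descriptor may already
	 * be closed and a retry could close an unrelated fd.
	 */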
2049 	do { ret = close(fd); } while (ret == -1 && errno == EINTR);
2050 #else
2051 	ret = close(fd);
2052 #endif /* POSIX_CLOSE_RESTART */
2053 
2054 	if (ret == -1 && errno == EIO)
2055 		log_warn("%s(%d)", __func__, fd);
2056 
2057 	return (ret);
2058 }
2059