/*	$NetBSD: nvmm.c,v 1.41 2020/09/08 16:58:38 maxv Exp $	*/

/*
 * Copyright (c) 2018-2020 Maxime Villard, m00nbsd.net
 * All rights reserved.
 *
 * This code is part of the NVMM hypervisor.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.41 2020/09/08 16:58:38 maxv Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/conf.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/mman.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/device.h>

#include <uvm/uvm_aobj.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_page.h>

#include "ioconf.h"

#include <dev/nvmm/nvmm.h>
#include <dev/nvmm/nvmm_internal.h>
#include <dev/nvmm/nvmm_ioctl.h>

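/*
 * Global VM state. Each machine occupies a static slot in 'machines';
 * 'nmachines' counts the slots currently present, and is consulted at
 * detach time to refuse unloading while VMs still exist.
 */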
static struct nvmm_machine machines[NVMM_MAX_MACHINES];
static volatile unsigned int nmachines __cacheline_aligned;

static const struct nvmm_impl *nvmm_impl_list[] = {
#if defined(__x86_64__)
	&nvmm_x86_svm,	/* x86 AMD SVM */
	&nvmm_x86_vmx	/* x86 Intel VMX */
#endif
};

static const struct nvmm_impl *nvmm_impl __read_mostly = NULL;

static struct nvmm_owner root_owner;

/* -------------------------------------------------------------------------- */

static int
nvmm_machine_alloc(struct nvmm_machine **ret)
{
	struct nvmm_machine *mach;
	size_t i;

	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
		mach = &machines[i];

		rw_enter(&mach->lock, RW_WRITER);
		if (mach->present) {
			rw_exit(&mach->lock);
			continue;
		}

		mach->present = true;
		mach->time = time_second;
		*ret = mach;
		atomic_inc_uint(&nmachines);
		return 0;
	}

	return ENOBUFS;
}

static void
nvmm_machine_free(struct nvmm_machine *mach)
{
	KASSERT(rw_write_held(&mach->lock));
	KASSERT(mach->present);
	mach->present = false;
	atomic_dec_uint(&nmachines);
}

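/*
 * Look up a machine by ID and return it with its lock held, either as
 * reader or writer. Only the owner of the machine (or the special
 * root_owner) may access it. Callers release with nvmm_machine_put().
 */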
static int
nvmm_machine_get(struct nvmm_owner *owner, nvmm_machid_t machid,
    struct nvmm_machine **ret, bool writer)
{
	struct nvmm_machine *mach;
	krw_t op = writer ? RW_WRITER : RW_READER;

	if (__predict_false(machid >= NVMM_MAX_MACHINES)) {
		return EINVAL;
	}
	mach = &machines[machid];

	rw_enter(&mach->lock, op);
	if (__predict_false(!mach->present)) {
		rw_exit(&mach->lock);
		return ENOENT;
	}
	if (__predict_false(mach->owner != owner && owner != &root_owner)) {
		rw_exit(&mach->lock);
		return EPERM;
	}
	*ret = mach;

	return 0;
}

static void
nvmm_machine_put(struct nvmm_machine *mach)
{
	rw_exit(&mach->lock);
}

/* -------------------------------------------------------------------------- */

static int
nvmm_vcpu_alloc(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_cpu **ret)
{
	struct nvmm_cpu *vcpu;

	if (cpuid >= NVMM_MAX_VCPUS) {
		return EINVAL;
	}
	vcpu = &mach->cpus[cpuid];

	mutex_enter(&vcpu->lock);
	if (vcpu->present) {
		mutex_exit(&vcpu->lock);
		return EBUSY;
	}

	vcpu->present = true;
	vcpu->comm = NULL;
	vcpu->hcpu_last = -1;
	*ret = vcpu;
	return 0;
}

static void
nvmm_vcpu_free(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
{
	KASSERT(mutex_owned(&vcpu->lock));
	vcpu->present = false;
	if (vcpu->comm != NULL) {
		uvm_deallocate(kernel_map, (vaddr_t)vcpu->comm, PAGE_SIZE);
	}
}

static int
nvmm_vcpu_get(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_cpu **ret)
{
	struct nvmm_cpu *vcpu;

	if (__predict_false(cpuid >= NVMM_MAX_VCPUS)) {
		return EINVAL;
	}
	vcpu = &mach->cpus[cpuid];

	mutex_enter(&vcpu->lock);
	if (__predict_false(!vcpu->present)) {
		mutex_exit(&vcpu->lock);
		return ENOENT;
	}
	*ret = vcpu;

	return 0;
}

static void
nvmm_vcpu_put(struct nvmm_cpu *vcpu)
{
	mutex_exit(&vcpu->lock);
}

/* -------------------------------------------------------------------------- */

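/*
 * Destroy all machines belonging to the given owner. Called when the
 * owner closes its device file descriptor, so that no VM outlives its
 * creator.
 */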
static void
nvmm_kill_machines(struct nvmm_owner *owner)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	size_t i, j;
	int error;

	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
		mach = &machines[i];

		rw_enter(&mach->lock, RW_WRITER);
		if (!mach->present || mach->owner != owner) {
			rw_exit(&mach->lock);
			continue;
		}

		/* Kill it. */
		for (j = 0; j < NVMM_MAX_VCPUS; j++) {
			error = nvmm_vcpu_get(mach, j, &vcpu);
			if (error)
				continue;
			(*nvmm_impl->vcpu_destroy)(mach, vcpu);
			nvmm_vcpu_free(mach, vcpu);
			nvmm_vcpu_put(vcpu);
			atomic_dec_uint(&mach->ncpus);
		}
		(*nvmm_impl->machine_destroy)(mach);
		uvmspace_free(mach->vm);

		/* Drop the kernel UOBJ refs. */
		for (j = 0; j < NVMM_MAX_HMAPPINGS; j++) {
			if (!mach->hmap[j].present)
				continue;
			uao_detach(mach->hmap[j].uobj);
		}

		nvmm_machine_free(mach);

		rw_exit(&mach->lock);
	}
}

/* -------------------------------------------------------------------------- */

static int
nvmm_capability(struct nvmm_owner *owner, struct nvmm_ioc_capability *args)
{
	args->cap.version = NVMM_KERN_VERSION;
	args->cap.state_size = nvmm_impl->state_size;
	args->cap.max_machines = NVMM_MAX_MACHINES;
	args->cap.max_vcpus = NVMM_MAX_VCPUS;
	args->cap.max_ram = NVMM_MAX_RAM;

	(*nvmm_impl->capability)(&args->cap);

	return 0;
}

static int
nvmm_machine_create(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_create *args)
{
	struct nvmm_machine *mach;
	int error;

	error = nvmm_machine_alloc(&mach);
	if (error)
		return error;

	/* Curproc owns the machine. */
	mach->owner = owner;

	/* Zero out the host mappings. */
	memset(&mach->hmap, 0, sizeof(mach->hmap));

	/* Create the machine vmspace. */
	mach->gpa_begin = 0;
	mach->gpa_end = NVMM_MAX_RAM;
	mach->vm = uvmspace_alloc(0, mach->gpa_end - mach->gpa_begin, false);

	/* Create the comm uobj. */
	mach->commuobj = uao_create(NVMM_MAX_VCPUS * PAGE_SIZE, 0);

	(*nvmm_impl->machine_create)(mach);

	args->machid = mach->machid;
	nvmm_machine_put(mach);

	return 0;
}

static int
nvmm_machine_destroy(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_destroy *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;
	size_t i;

	error = nvmm_machine_get(owner, args->machid, &mach, true);
	if (error)
		return error;

	for (i = 0; i < NVMM_MAX_VCPUS; i++) {
		error = nvmm_vcpu_get(mach, i, &vcpu);
		if (error)
			continue;

		(*nvmm_impl->vcpu_destroy)(mach, vcpu);
		nvmm_vcpu_free(mach, vcpu);
		nvmm_vcpu_put(vcpu);
		atomic_dec_uint(&mach->ncpus);
	}

	(*nvmm_impl->machine_destroy)(mach);

	/* Free the machine vmspace. */
	uvmspace_free(mach->vm);

	/* Drop the kernel UOBJ refs. */
	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		if (!mach->hmap[i].present)
			continue;
		uao_detach(mach->hmap[i].uobj);
	}

	nvmm_machine_free(mach);
	nvmm_machine_put(mach);

	return 0;
}

static int
nvmm_machine_configure(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_configure *args)
{
	struct nvmm_machine *mach;
	size_t allocsz;
	uint64_t op;
	void *data;
	int error;

	op = NVMM_MACH_CONF_MD(args->op);
	if (__predict_false(op >= nvmm_impl->mach_conf_max)) {
		return EINVAL;
	}

	allocsz = nvmm_impl->mach_conf_sizes[op];
	data = kmem_alloc(allocsz, KM_SLEEP);

	error = nvmm_machine_get(owner, args->machid, &mach, true);
	if (error) {
		kmem_free(data, allocsz);
		return error;
	}

	error = copyin(args->conf, data, allocsz);
	if (error) {
		goto out;
	}

	error = (*nvmm_impl->machine_configure)(mach, op, data);

out:
	nvmm_machine_put(mach);
	kmem_free(data, allocsz);
	return error;
}

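/*
 * Create a VCPU on the given machine. The per-VCPU comm page is one
 * page of the machine's comm uobj, mapped and wired into the kernel
 * map here; userland later maps the same page via mmap() on the device
 * (see nvmm_mmap below), giving both sides a shared communication area.
 */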
static int
nvmm_vcpu_create(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_create *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_alloc(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	/* Allocate the comm page. */
	uao_reference(mach->commuobj);
	error = uvm_map(kernel_map, (vaddr_t *)&vcpu->comm, PAGE_SIZE,
	    mach->commuobj, args->cpuid * PAGE_SIZE, 0, UVM_MAPFLAG(UVM_PROT_RW,
	    UVM_PROT_RW, UVM_INH_SHARE, UVM_ADV_RANDOM, 0));
	if (error) {
		uao_detach(mach->commuobj);
		nvmm_vcpu_free(mach, vcpu);
		nvmm_vcpu_put(vcpu);
		goto out;
	}
	error = uvm_map_pageable(kernel_map, (vaddr_t)vcpu->comm,
	    (vaddr_t)vcpu->comm + PAGE_SIZE, false, 0);
	if (error) {
		nvmm_vcpu_free(mach, vcpu);
		nvmm_vcpu_put(vcpu);
		goto out;
	}
	memset(vcpu->comm, 0, PAGE_SIZE);

	error = (*nvmm_impl->vcpu_create)(mach, vcpu);
	if (error) {
		nvmm_vcpu_free(mach, vcpu);
		nvmm_vcpu_put(vcpu);
		goto out;
	}

	nvmm_vcpu_put(vcpu);
	atomic_inc_uint(&mach->ncpus);

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_vcpu_destroy(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_destroy *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	(*nvmm_impl->vcpu_destroy)(mach, vcpu);
	nvmm_vcpu_free(mach, vcpu);
	nvmm_vcpu_put(vcpu);
	atomic_dec_uint(&mach->ncpus);

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_vcpu_configure(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_configure *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	size_t allocsz;
	uint64_t op;
	void *data;
	int error;

	op = NVMM_VCPU_CONF_MD(args->op);
	if (__predict_false(op >= nvmm_impl->vcpu_conf_max))
		return EINVAL;

	allocsz = nvmm_impl->vcpu_conf_sizes[op];
	data = kmem_alloc(allocsz, KM_SLEEP);

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error) {
		kmem_free(data, allocsz);
		return error;
	}

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error) {
		nvmm_machine_put(mach);
		kmem_free(data, allocsz);
		return error;
	}

	error = copyin(args->conf, data, allocsz);
	if (error) {
		goto out;
	}

	error = (*nvmm_impl->vcpu_configure)(vcpu, op, data);

out:
	nvmm_vcpu_put(vcpu);
	nvmm_machine_put(mach);
	kmem_free(data, allocsz);
	return error;
}

static int
nvmm_vcpu_setstate(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_setstate *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	(*nvmm_impl->vcpu_setstate)(vcpu);
	nvmm_vcpu_put(vcpu);

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_vcpu_getstate(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_getstate *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	(*nvmm_impl->vcpu_getstate)(vcpu);
	nvmm_vcpu_put(vcpu);

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_vcpu_inject(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_inject *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	error = (*nvmm_impl->vcpu_inject)(vcpu);
	nvmm_vcpu_put(vcpu);

out:
	nvmm_machine_put(mach);
	return error;
}

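/*
 * Run loop. The VCPU executes until the backend reports an exit that
 * userland must handle. Nested page faults on guest RAM are resolved
 * directly here with uvm_fault() on the machine's vmspace, and the
 * VCPU is resumed without bouncing back to userland. Pending signals
 * or rescheduling requests end the loop with NVMM_VCPU_EXIT_NONE.
 */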
static int
nvmm_do_vcpu_run(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
    struct nvmm_vcpu_exit *exit)
{
	struct vmspace *vm = mach->vm;
	int ret;

	while (1) {
		/* Got a signal? Or pending resched? Leave. */
		if (__predict_false(nvmm_return_needed())) {
			exit->reason = NVMM_VCPU_EXIT_NONE;
			return 0;
		}

		/* Run the VCPU. */
		ret = (*nvmm_impl->vcpu_run)(mach, vcpu, exit);
		if (__predict_false(ret != 0)) {
			return ret;
		}

		/* Process nested page faults. */
		if (__predict_true(exit->reason != NVMM_VCPU_EXIT_MEMORY)) {
			break;
		}
		if (exit->u.mem.gpa >= mach->gpa_end) {
			break;
		}
		if (uvm_fault(&vm->vm_map, exit->u.mem.gpa, exit->u.mem.prot)) {
			break;
		}
	}

	return 0;
}

static int
nvmm_vcpu_run(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_run *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	error = nvmm_do_vcpu_run(mach, vcpu, &args->exit);
	nvmm_vcpu_put(vcpu);

out:
	nvmm_machine_put(mach);
	return error;
}

/* -------------------------------------------------------------------------- */

static struct uvm_object *
nvmm_hmapping_getuobj(struct nvmm_machine *mach, uintptr_t hva, size_t size,
    size_t *off)
{
	struct nvmm_hmapping *hmapping;
	size_t i;

	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		hmapping = &mach->hmap[i];
		if (!hmapping->present) {
			continue;
		}
		if (hva >= hmapping->hva &&
		    hva + size <= hmapping->hva + hmapping->size) {
			*off = hva - hmapping->hva;
			return hmapping->uobj;
		}
	}

	return NULL;
}

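/*
 * Validate a candidate HVA range: it must be page-aligned, non-NULL,
 * and either fall entirely within an existing mapping or not intersect
 * any existing mapping at all. Partial overlaps are refused with
 * EEXIST.
 */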
static int
nvmm_hmapping_validate(struct nvmm_machine *mach, uintptr_t hva, size_t size)
{
	struct nvmm_hmapping *hmapping;
	size_t i;

	if ((hva % PAGE_SIZE) != 0 || (size % PAGE_SIZE) != 0) {
		return EINVAL;
	}
	if (hva == 0) {
		return EINVAL;
	}

	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		hmapping = &mach->hmap[i];
		if (!hmapping->present) {
			continue;
		}

		if (hva >= hmapping->hva &&
		    hva + size <= hmapping->hva + hmapping->size) {
			break;
		}

		if (hva >= hmapping->hva &&
		    hva < hmapping->hva + hmapping->size) {
			return EEXIST;
		}
		if (hva + size > hmapping->hva &&
		    hva + size <= hmapping->hva + hmapping->size) {
			return EEXIST;
		}
		if (hva <= hmapping->hva &&
		    hva + size >= hmapping->hva + hmapping->size) {
			return EEXIST;
		}
	}

	return 0;
}

static struct nvmm_hmapping *
nvmm_hmapping_alloc(struct nvmm_machine *mach)
{
	struct nvmm_hmapping *hmapping;
	size_t i;

	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		hmapping = &mach->hmap[i];
		if (!hmapping->present) {
			hmapping->present = true;
			return hmapping;
		}
	}

	return NULL;
}

static int
nvmm_hmapping_free(struct nvmm_machine *mach, uintptr_t hva, size_t size)
{
	struct vmspace *vmspace = curproc->p_vmspace;
	struct nvmm_hmapping *hmapping;
	size_t i;

	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		hmapping = &mach->hmap[i];
		if (!hmapping->present || hmapping->hva != hva ||
		    hmapping->size != size) {
			continue;
		}

		uvm_unmap(&vmspace->vm_map, hmapping->hva,
		    hmapping->hva + hmapping->size);
		uao_detach(hmapping->uobj);

		hmapping->uobj = NULL;
		hmapping->present = false;

		return 0;
	}

	return ENOENT;
}

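/*
 * Register a host mapping. The backing store is an anonymous UVM
 * object: one reference is kept by the machine, and a second one is
 * taken for the user mapping established below, so the memory stays
 * alive as long as either side uses it.
 */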
static int
nvmm_hva_map(struct nvmm_owner *owner, struct nvmm_ioc_hva_map *args)
{
	struct vmspace *vmspace = curproc->p_vmspace;
	struct nvmm_machine *mach;
	struct nvmm_hmapping *hmapping;
	vaddr_t uva;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, true);
	if (error)
		return error;

	error = nvmm_hmapping_validate(mach, args->hva, args->size);
	if (error)
		goto out;

	hmapping = nvmm_hmapping_alloc(mach);
	if (hmapping == NULL) {
		error = ENOBUFS;
		goto out;
	}

	hmapping->hva = args->hva;
	hmapping->size = args->size;
	hmapping->uobj = uao_create(hmapping->size, 0);
	uva = hmapping->hva;

	/* Take a reference for the user. */
	uao_reference(hmapping->uobj);

	/* Map the uobj into the user address space, as pageable. */
	error = uvm_map(&vmspace->vm_map, &uva, hmapping->size, hmapping->uobj,
	    0, 0, UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_SHARE,
	    UVM_ADV_RANDOM, UVM_FLAG_FIXED|UVM_FLAG_UNMAP));
	if (error) {
		uao_detach(hmapping->uobj);
	}

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_hva_unmap(struct nvmm_owner *owner, struct nvmm_ioc_hva_unmap *args)
{
	struct nvmm_machine *mach;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, true);
	if (error)
		return error;

	error = nvmm_hmapping_free(mach, args->hva, args->size);

	nvmm_machine_put(mach);
	return error;
}

/* -------------------------------------------------------------------------- */

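/*
 * Map a previously registered HVA range into the guest physical address
 * space. The HVA must resolve to a host mapping's uobj; that uobj is
 * then mapped into the machine's vmspace at the requested GPA, as
 * pageable, so guest RAM is demand-paged like ordinary anonymous
 * memory.
 */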
static int
nvmm_gpa_map(struct nvmm_owner *owner, struct nvmm_ioc_gpa_map *args)
{
	struct nvmm_machine *mach;
	struct uvm_object *uobj;
	gpaddr_t gpa;
	size_t off;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	if ((args->prot & ~(PROT_READ|PROT_WRITE|PROT_EXEC)) != 0) {
		error = EINVAL;
		goto out;
	}

	if ((args->gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0 ||
	    (args->hva % PAGE_SIZE) != 0) {
		error = EINVAL;
		goto out;
	}
	if (args->hva == 0) {
		error = EINVAL;
		goto out;
	}
	if (args->gpa < mach->gpa_begin || args->gpa >= mach->gpa_end) {
		error = EINVAL;
		goto out;
	}
	if (args->gpa + args->size <= args->gpa) {
		error = EINVAL;
		goto out;
	}
	if (args->gpa + args->size > mach->gpa_end) {
		error = EINVAL;
		goto out;
	}
	gpa = args->gpa;

	uobj = nvmm_hmapping_getuobj(mach, args->hva, args->size, &off);
	if (uobj == NULL) {
		error = EINVAL;
		goto out;
	}

	/* Take a reference for the machine. */
	uao_reference(uobj);

	/* Map the uobj into the machine address space, as pageable. */
	error = uvm_map(&mach->vm->vm_map, &gpa, args->size, uobj, off, 0,
	    UVM_MAPFLAG(args->prot, UVM_PROT_RWX, UVM_INH_NONE,
	    UVM_ADV_RANDOM, UVM_FLAG_FIXED|UVM_FLAG_UNMAP));
	if (error) {
		uao_detach(uobj);
		goto out;
	}
	if (gpa != args->gpa) {
		uao_detach(uobj);
		printf("[!] uvm_map problem\n");
		error = EINVAL;
		goto out;
	}

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_gpa_unmap(struct nvmm_owner *owner, struct nvmm_ioc_gpa_unmap *args)
{
	struct nvmm_machine *mach;
	gpaddr_t gpa;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	if ((args->gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0) {
		error = EINVAL;
		goto out;
	}
	if (args->gpa < mach->gpa_begin || args->gpa >= mach->gpa_end) {
		error = EINVAL;
		goto out;
	}
	if (args->gpa + args->size <= args->gpa) {
		error = EINVAL;
		goto out;
	}
	if (args->gpa + args->size > mach->gpa_end) {
		error = EINVAL;
		goto out;
	}
	gpa = args->gpa;

	/* Unmap the memory from the machine. */
	uvm_unmap(&mach->vm->vm_map, gpa, gpa + args->size);

out:
	nvmm_machine_put(mach);
	return error;
}

/* -------------------------------------------------------------------------- */

static int
nvmm_ctl_mach_info(struct nvmm_owner *owner, struct nvmm_ioc_ctl *args)
{
	struct nvmm_ctl_mach_info ctl;
	struct nvmm_machine *mach;
	int error;
	size_t i;

	if (args->size != sizeof(ctl))
		return EINVAL;
	error = copyin(args->data, &ctl, sizeof(ctl));
	if (error)
		return error;

	error = nvmm_machine_get(owner, ctl.machid, &mach, true);
	if (error)
		return error;

	ctl.nvcpus = mach->ncpus;

	ctl.nram = 0;
	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		if (!mach->hmap[i].present)
			continue;
		ctl.nram += mach->hmap[i].size;
	}

	ctl.pid = mach->owner->pid;
	ctl.time = mach->time;

	nvmm_machine_put(mach);

	error = copyout(&ctl, args->data, sizeof(ctl));
	if (error)
		return error;

	return 0;
}

static int
nvmm_ctl(struct nvmm_owner *owner, struct nvmm_ioc_ctl *args)
{
	switch (args->op) {
	case NVMM_CTL_MACH_INFO:
		return nvmm_ctl_mach_info(owner, args);
	default:
		return EINVAL;
	}
}

/* -------------------------------------------------------------------------- */

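/*
 * Pick the first backend whose ident() hook accepts the host CPU; on
 * x86 this selects between AMD SVM and Intel VMX.
 */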
static const struct nvmm_impl *
nvmm_ident(void)
{
	size_t i;

	for (i = 0; i < __arraycount(nvmm_impl_list); i++) {
		if ((*nvmm_impl_list[i]->ident)())
			return nvmm_impl_list[i];
	}

	return NULL;
}

static int
nvmm_init(void)
{
	size_t i, n;

	nvmm_impl = nvmm_ident();
	if (nvmm_impl == NULL)
		return ENOTSUP;

	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
		machines[i].machid = i;
		rw_init(&machines[i].lock);
		for (n = 0; n < NVMM_MAX_VCPUS; n++) {
			machines[i].cpus[n].present = false;
			machines[i].cpus[n].cpuid = n;
			mutex_init(&machines[i].cpus[n].lock, MUTEX_DEFAULT,
			    IPL_NONE);
		}
	}

	(*nvmm_impl->init)();

	return 0;
}

static void
nvmm_fini(void)
{
	size_t i, n;

	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
		rw_destroy(&machines[i].lock);
		for (n = 0; n < NVMM_MAX_VCPUS; n++) {
			mutex_destroy(&machines[i].cpus[n].lock);
		}
	}

	(*nvmm_impl->fini)();
	nvmm_impl = NULL;
}

/* -------------------------------------------------------------------------- */

static dev_type_open(nvmm_open);

const struct cdevsw nvmm_cdevsw = {
	.d_open = nvmm_open,
	.d_close = noclose,
	.d_read = noread,
	.d_write = nowrite,
	.d_ioctl = noioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_OTHER | D_MPSAFE
};

static int nvmm_ioctl(file_t *, u_long, void *);
static int nvmm_close(file_t *);
static int nvmm_mmap(file_t *, off_t *, size_t, int, int *, int *,
    struct uvm_object **, int *);

static const struct fileops nvmm_fileops = {
	.fo_read = fbadop_read,
	.fo_write = fbadop_write,
	.fo_ioctl = nvmm_ioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = fnullop_poll,
	.fo_stat = fbadop_stat,
	.fo_close = nvmm_close,
	.fo_kqfilter = fnullop_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = nvmm_mmap,
};

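/*
 * Open the NVMM device. Each open is cloned onto an anonymous file
 * descriptor with its own nvmm_owner; machines created through that
 * descriptor belong to it. A write-only open yields the special
 * root_owner, which bypasses the ownership check in nvmm_machine_get().
 */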
static int
nvmm_open(dev_t dev, int flags, int type, struct lwp *l)
{
	struct nvmm_owner *owner;
	struct file *fp;
	int error, fd;

	if (__predict_false(nvmm_impl == NULL))
		return ENXIO;
	if (minor(dev) != 0)
		return EXDEV;
	if (!(flags & O_CLOEXEC))
		return EINVAL;
	error = fd_allocfile(&fp, &fd);
	if (error)
		return error;

	if (OFLAGS(flags) & O_WRONLY) {
		owner = &root_owner;
	} else {
		owner = kmem_alloc(sizeof(*owner), KM_SLEEP);
		owner->pid = l->l_proc->p_pid;
	}

	return fd_clone(fp, fd, flags, &nvmm_fileops, owner);
}

static int
nvmm_close(file_t *fp)
{
	struct nvmm_owner *owner = fp->f_data;

	KASSERT(owner != NULL);
	nvmm_kill_machines(owner);
	if (owner != &root_owner) {
		kmem_free(owner, sizeof(*owner));
	}
	fp->f_data = NULL;

	return 0;
}

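/*
 * mmap() on the device maps a VCPU comm page into userland. The machine
 * and VCPU IDs are encoded in the file offset (NVMM_COMM_MACHID and
 * NVMM_COMM_CPUID); the returned mapping is the same commuobj page that
 * the kernel wired in nvmm_vcpu_create().
 */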
static int
nvmm_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
	struct nvmm_owner *owner = fp->f_data;
	struct nvmm_machine *mach;
	nvmm_machid_t machid;
	nvmm_cpuid_t cpuid;
	int error;

	if (prot & PROT_EXEC)
		return EACCES;
	if (size != PAGE_SIZE)
		return EINVAL;

	cpuid = NVMM_COMM_CPUID(*offp);
	if (__predict_false(cpuid >= NVMM_MAX_VCPUS))
		return EINVAL;

	machid = NVMM_COMM_MACHID(*offp);
	error = nvmm_machine_get(owner, machid, &mach, false);
	if (error)
		return error;

	uao_reference(mach->commuobj);
	*uobjp = mach->commuobj;
	*offp = cpuid * PAGE_SIZE;
	*maxprotp = prot;
	*advicep = UVM_ADV_RANDOM;

	nvmm_machine_put(mach);
	return 0;
}

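/*
 * Dispatch an ioctl to its handler. As an illustration only -- real
 * clients should go through libnvmm(3) rather than the raw ioctls --
 * a minimal userland sequence might look like the sketch below; the
 * struct fields shown are assumed from how the handlers in this file
 * use them, and error handling is elided:
 *
 *	int fd = open("/dev/nvmm", O_RDWR | O_CLOEXEC);
 *	struct nvmm_ioc_machine_create mc;
 *	ioctl(fd, NVMM_IOC_MACHINE_CREATE, &mc);	// sets mc.machid
 *	struct nvmm_ioc_vcpu_create vc = { .machid = mc.machid, .cpuid = 0 };
 *	ioctl(fd, NVMM_IOC_VCPU_CREATE, &vc);
 *	struct nvmm_ioc_vcpu_run vr = { .machid = mc.machid, .cpuid = 0 };
 *	ioctl(fd, NVMM_IOC_VCPU_RUN, &vr);
 *	// vr.exit now describes why the VCPU stopped
 */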
static int
nvmm_ioctl(file_t *fp, u_long cmd, void *data)
{
	struct nvmm_owner *owner = fp->f_data;

	KASSERT(owner != NULL);

	switch (cmd) {
	case NVMM_IOC_CAPABILITY:
		return nvmm_capability(owner, data);
	case NVMM_IOC_MACHINE_CREATE:
		return nvmm_machine_create(owner, data);
	case NVMM_IOC_MACHINE_DESTROY:
		return nvmm_machine_destroy(owner, data);
	case NVMM_IOC_MACHINE_CONFIGURE:
		return nvmm_machine_configure(owner, data);
	case NVMM_IOC_VCPU_CREATE:
		return nvmm_vcpu_create(owner, data);
	case NVMM_IOC_VCPU_DESTROY:
		return nvmm_vcpu_destroy(owner, data);
	case NVMM_IOC_VCPU_CONFIGURE:
		return nvmm_vcpu_configure(owner, data);
	case NVMM_IOC_VCPU_SETSTATE:
		return nvmm_vcpu_setstate(owner, data);
	case NVMM_IOC_VCPU_GETSTATE:
		return nvmm_vcpu_getstate(owner, data);
	case NVMM_IOC_VCPU_INJECT:
		return nvmm_vcpu_inject(owner, data);
	case NVMM_IOC_VCPU_RUN:
		return nvmm_vcpu_run(owner, data);
	case NVMM_IOC_GPA_MAP:
		return nvmm_gpa_map(owner, data);
	case NVMM_IOC_GPA_UNMAP:
		return nvmm_gpa_unmap(owner, data);
	case NVMM_IOC_HVA_MAP:
		return nvmm_hva_map(owner, data);
	case NVMM_IOC_HVA_UNMAP:
		return nvmm_hva_unmap(owner, data);
	case NVMM_IOC_CTL:
		return nvmm_ctl(owner, data);
	default:
		return EINVAL;
	}
}

/* -------------------------------------------------------------------------- */

static int nvmm_match(device_t, cfdata_t, void *);
static void nvmm_attach(device_t, device_t, void *);
static int nvmm_detach(device_t, int);

extern struct cfdriver nvmm_cd;

CFATTACH_DECL_NEW(nvmm, 0, nvmm_match, nvmm_attach, nvmm_detach, NULL);

static struct cfdata nvmm_cfdata[] = {
	{
		.cf_name = "nvmm",
		.cf_atname = "nvmm",
		.cf_unit = 0,
		.cf_fstate = FSTATE_STAR,
		.cf_loc = NULL,
		.cf_flags = 0,
		.cf_pspec = NULL,
	},
	{ NULL, NULL, 0, FSTATE_NOTFOUND, NULL, 0, NULL }
};

static int
nvmm_match(device_t self, cfdata_t cfdata, void *arg)
{
	return 1;
}

static void
nvmm_attach(device_t parent, device_t self, void *aux)
{
	int error;

	error = nvmm_init();
	if (error)
		panic("%s: impossible", __func__);
	aprint_normal_dev(self, "attached, using backend %s\n",
	    nvmm_impl->name);
}

static int
nvmm_detach(device_t self, int flags)
{
	if (atomic_load_relaxed(&nmachines) > 0)
		return EBUSY;
	nvmm_fini();
	return 0;
}

void
nvmmattach(int nunits)
{
	/* nothing */
}

MODULE(MODULE_CLASS_MISC, nvmm, NULL);

#if defined(_MODULE)
CFDRIVER_DECL(nvmm, DV_VIRTUAL, NULL);
#endif

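/*
 * Module glue. On load, the backend is identified first; the cfdriver,
 * cfattach and cfdata are then registered and a pseudo-device instance
 * attached. When built as a module, the character device is also
 * registered (major 345, i.e. "mknod /dev/nvmm c 345 0").
 */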
static int
nvmm_modcmd(modcmd_t cmd, void *arg)
{
#if defined(_MODULE)
	devmajor_t bmajor = NODEVMAJOR;
	devmajor_t cmajor = 345;
#endif
	int error;

	switch (cmd) {
	case MODULE_CMD_INIT:
		if (nvmm_ident() == NULL) {
			aprint_error("%s: cpu not supported\n",
			    nvmm_cd.cd_name);
			return ENOTSUP;
		}
#if defined(_MODULE)
		error = config_cfdriver_attach(&nvmm_cd);
		if (error)
			return error;
#endif
		error = config_cfattach_attach(nvmm_cd.cd_name, &nvmm_ca);
		if (error) {
			config_cfdriver_detach(&nvmm_cd);
			aprint_error("%s: config_cfattach_attach failed\n",
			    nvmm_cd.cd_name);
			return error;
		}

		error = config_cfdata_attach(nvmm_cfdata, 1);
		if (error) {
			config_cfattach_detach(nvmm_cd.cd_name, &nvmm_ca);
			config_cfdriver_detach(&nvmm_cd);
			aprint_error("%s: unable to register cfdata\n",
			    nvmm_cd.cd_name);
			return error;
		}

		if (config_attach_pseudo(nvmm_cfdata) == NULL) {
			aprint_error("%s: config_attach_pseudo failed\n",
			    nvmm_cd.cd_name);
			config_cfattach_detach(nvmm_cd.cd_name, &nvmm_ca);
			config_cfdriver_detach(&nvmm_cd);
			return ENXIO;
		}

#if defined(_MODULE)
		/* mknod /dev/nvmm c 345 0 */
		error = devsw_attach(nvmm_cd.cd_name, NULL, &bmajor,
			&nvmm_cdevsw, &cmajor);
		if (error) {
			aprint_error("%s: unable to register devsw\n",
			    nvmm_cd.cd_name);
			config_cfattach_detach(nvmm_cd.cd_name, &nvmm_ca);
			config_cfdriver_detach(&nvmm_cd);
			return error;
		}
#endif
		return 0;
	case MODULE_CMD_FINI:
		error = config_cfdata_detach(nvmm_cfdata);
		if (error)
			return error;
		error = config_cfattach_detach(nvmm_cd.cd_name, &nvmm_ca);
		if (error)
			return error;
#if defined(_MODULE)
		config_cfdriver_detach(&nvmm_cd);
		devsw_detach(NULL, &nvmm_cdevsw);
#endif
		return 0;
	case MODULE_CMD_AUTOUNLOAD:
		return EBUSY;
	default:
		return ENOTTY;
	}
}
1309