/*	$NetBSD: nvmm.c,v 1.32 2020/07/03 16:09:54 maxv Exp $	*/

/*
 * Copyright (c) 2018-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.32 2020/07/03 16:09:54 maxv Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>

#include <sys/cpu.h>
#include <sys/conf.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/mman.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/device.h>

#include <uvm/uvm.h>
#include <uvm/uvm_page.h>

#include "ioconf.h"

#include <dev/nvmm/nvmm.h>
#include <dev/nvmm/nvmm_internal.h>
#include <dev/nvmm/nvmm_ioctl.h>

static struct nvmm_machine machines[NVMM_MAX_MACHINES];
static volatile unsigned int nmachines __cacheline_aligned;

static const struct nvmm_impl *nvmm_impl_list[] = {
	&nvmm_x86_svm,	/* x86 AMD SVM */
	&nvmm_x86_vmx	/* x86 Intel VMX */
};

static const struct nvmm_impl *nvmm_impl = NULL;

static struct nvmm_owner root_owner;

/* -------------------------------------------------------------------------- */

static int
nvmm_machine_alloc(struct nvmm_machine **ret)
{
	struct nvmm_machine *mach;
	size_t i;

	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
		mach = &machines[i];

		rw_enter(&mach->lock, RW_WRITER);
		if (mach->present) {
			rw_exit(&mach->lock);
			continue;
		}

		mach->present = true;
		mach->time = time_second;
		*ret = mach;
		atomic_inc_uint(&nmachines);
		return 0;
	}

	return ENOBUFS;
}

static void
nvmm_machine_free(struct nvmm_machine *mach)
{
	KASSERT(rw_write_held(&mach->lock));
	KASSERT(mach->present);
	mach->present = false;
	atomic_dec_uint(&nmachines);
}

static int
nvmm_machine_get(struct nvmm_owner *owner, nvmm_machid_t machid,
    struct nvmm_machine **ret, bool writer)
{
	struct nvmm_machine *mach;
	krw_t op = writer ? RW_WRITER : RW_READER;

	if (machid >= NVMM_MAX_MACHINES) {
		return EINVAL;
	}
	mach = &machines[machid];

	rw_enter(&mach->lock, op);
	if (!mach->present) {
		rw_exit(&mach->lock);
		return ENOENT;
	}
	if (owner != &root_owner && mach->owner != owner) {
		rw_exit(&mach->lock);
		return EPERM;
	}
	*ret = mach;

	return 0;
}

static void
nvmm_machine_put(struct nvmm_machine *mach)
{
	rw_exit(&mach->lock);
}

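/*
 * Usage sketch (illustrative only, not called anywhere): both
 * nvmm_machine_alloc() and nvmm_machine_get() return with the machine
 * lock held, so every success path must end in nvmm_machine_put().
 * The hypothetical handler below shows the bracketing pattern the
 * ioctl handlers in this file follow.
 *
 *	static int
 *	nvmm_example_handler(struct nvmm_owner *owner, nvmm_machid_t machid)
 *	{
 *		struct nvmm_machine *mach;
 *		int error;
 *
 *		error = nvmm_machine_get(owner, machid, &mach, false);
 *		if (error)
 *			return error;
 *		... operate on mach under the reader lock ...
 *		nvmm_machine_put(mach);
 *		return 0;
 *	}
 */
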
/* -------------------------------------------------------------------------- */

static int
nvmm_vcpu_alloc(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_cpu **ret)
{
	struct nvmm_cpu *vcpu;

	if (cpuid >= NVMM_MAX_VCPUS) {
		return EINVAL;
	}
	vcpu = &mach->cpus[cpuid];

	mutex_enter(&vcpu->lock);
	if (vcpu->present) {
		mutex_exit(&vcpu->lock);
		return EBUSY;
	}

	vcpu->present = true;
	vcpu->comm = NULL;
	vcpu->hcpu_last = -1;
	*ret = vcpu;
	return 0;
}

static void
nvmm_vcpu_free(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
{
	KASSERT(mutex_owned(&vcpu->lock));
	vcpu->present = false;
	if (vcpu->comm != NULL) {
		uvm_deallocate(kernel_map, (vaddr_t)vcpu->comm, PAGE_SIZE);
	}
}

static int
nvmm_vcpu_get(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_cpu **ret)
{
	struct nvmm_cpu *vcpu;

	if (cpuid >= NVMM_MAX_VCPUS) {
		return EINVAL;
	}
	vcpu = &mach->cpus[cpuid];

	mutex_enter(&vcpu->lock);
	if (!vcpu->present) {
		mutex_exit(&vcpu->lock);
		return ENOENT;
	}
	*ret = vcpu;

	return 0;
}

static void
nvmm_vcpu_put(struct nvmm_cpu *vcpu)
{
	mutex_exit(&vcpu->lock);
}

/* -------------------------------------------------------------------------- */

static void
nvmm_kill_machines(struct nvmm_owner *owner)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	size_t i, j;
	int error;

	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
		mach = &machines[i];

		rw_enter(&mach->lock, RW_WRITER);
		if (!mach->present || mach->owner != owner) {
			rw_exit(&mach->lock);
			continue;
		}

		/* Kill it. */
		for (j = 0; j < NVMM_MAX_VCPUS; j++) {
			error = nvmm_vcpu_get(mach, j, &vcpu);
			if (error)
				continue;
			(*nvmm_impl->vcpu_destroy)(mach, vcpu);
			nvmm_vcpu_free(mach, vcpu);
			nvmm_vcpu_put(vcpu);
		}
		(*nvmm_impl->machine_destroy)(mach);
		uvmspace_free(mach->vm);

		/* Drop the kernel UOBJ refs. */
		for (j = 0; j < NVMM_MAX_HMAPPINGS; j++) {
			if (!mach->hmap[j].present)
				continue;
			uao_detach(mach->hmap[j].uobj);
		}

		nvmm_machine_free(mach);

		rw_exit(&mach->lock);
	}
}

/* -------------------------------------------------------------------------- */

static int
nvmm_capability(struct nvmm_owner *owner, struct nvmm_ioc_capability *args)
{
	args->cap.version = NVMM_KERN_VERSION;
	args->cap.state_size = nvmm_impl->state_size;
	args->cap.max_machines = NVMM_MAX_MACHINES;
	args->cap.max_vcpus = NVMM_MAX_VCPUS;
	args->cap.max_ram = NVMM_MAX_RAM;

	(*nvmm_impl->capability)(&args->cap);

	return 0;
}

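/*
 * Illustrative userland version check (sketch): a client is expected to
 * issue this ioctl first and refuse to run if the kernel's idea of the
 * ABI version does not match the headers it was built against.
 *
 *	struct nvmm_ioc_capability args;
 *	if (ioctl(fd, NVMM_IOC_CAPABILITY, &args) == -1)
 *		err(EXIT_FAILURE, "capability");
 *	if (args.cap.version != NVMM_KERN_VERSION)
 *		errx(EXIT_FAILURE, "NVMM version mismatch");
 */
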
static int
nvmm_machine_create(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_create *args)
{
	struct nvmm_machine *mach;
	int error;

	error = nvmm_machine_alloc(&mach);
	if (error)
		return error;

	/* Curproc owns the machine. */
	mach->owner = owner;

	/* Zero out the host mappings. */
	memset(&mach->hmap, 0, sizeof(mach->hmap));

	/* Create the machine vmspace. */
	mach->gpa_begin = 0;
	mach->gpa_end = NVMM_MAX_RAM;
	mach->vm = uvmspace_alloc(0, mach->gpa_end - mach->gpa_begin, false);

	/* Create the comm uobj. */
	mach->commuobj = uao_create(NVMM_MAX_VCPUS * PAGE_SIZE, 0);

	(*nvmm_impl->machine_create)(mach);

	args->machid = mach->machid;
	nvmm_machine_put(mach);

	return 0;
}

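/*
 * Illustrative userland call sequence (sketch, normally hidden behind
 * libnvmm): the device must be opened with O_CLOEXEC, as enforced in
 * nvmm_open() below.
 *
 *	int fd = open("/dev/nvmm", O_RDWR | O_CLOEXEC);
 *	struct nvmm_ioc_machine_create args;
 *	if (fd == -1 || ioctl(fd, NVMM_IOC_MACHINE_CREATE, &args) == -1)
 *		err(EXIT_FAILURE, "machine_create");
 *	... args.machid now identifies the machine in later ioctls ...
 */
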
static int
nvmm_machine_destroy(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_destroy *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;
	size_t i;

	error = nvmm_machine_get(owner, args->machid, &mach, true);
	if (error)
		return error;

	for (i = 0; i < NVMM_MAX_VCPUS; i++) {
		error = nvmm_vcpu_get(mach, i, &vcpu);
		if (error)
			continue;

		(*nvmm_impl->vcpu_destroy)(mach, vcpu);
		nvmm_vcpu_free(mach, vcpu);
		nvmm_vcpu_put(vcpu);
	}

	(*nvmm_impl->machine_destroy)(mach);

	/* Free the machine vmspace. */
	uvmspace_free(mach->vm);

	/* Drop the kernel UOBJ refs. */
	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		if (!mach->hmap[i].present)
			continue;
		uao_detach(mach->hmap[i].uobj);
	}

	nvmm_machine_free(mach);
	nvmm_machine_put(mach);

	return 0;
}

static int
nvmm_machine_configure(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_configure *args)
{
	struct nvmm_machine *mach;
	size_t allocsz;
	uint64_t op;
	void *data;
	int error;

	op = NVMM_MACH_CONF_MD(args->op);
	if (__predict_false(op >= nvmm_impl->mach_conf_max)) {
		return EINVAL;
	}

	allocsz = nvmm_impl->mach_conf_sizes[op];
	data = kmem_alloc(allocsz, KM_SLEEP);

	error = nvmm_machine_get(owner, args->machid, &mach, true);
	if (error) {
		kmem_free(data, allocsz);
		return error;
	}

	error = copyin(args->conf, data, allocsz);
	if (error) {
		goto out;
	}

	error = (*nvmm_impl->machine_configure)(mach, op, data);

out:
	nvmm_machine_put(mach);
	kmem_free(data, allocsz);
	return error;
}

static int
nvmm_vcpu_create(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_create *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_alloc(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	/* Allocate the comm page. */
	uao_reference(mach->commuobj);
	error = uvm_map(kernel_map, (vaddr_t *)&vcpu->comm, PAGE_SIZE,
	    mach->commuobj, args->cpuid * PAGE_SIZE, 0, UVM_MAPFLAG(UVM_PROT_RW,
	    UVM_PROT_RW, UVM_INH_SHARE, UVM_ADV_RANDOM, 0));
	if (error) {
		uao_detach(mach->commuobj);
		nvmm_vcpu_free(mach, vcpu);
		nvmm_vcpu_put(vcpu);
		goto out;
	}
	error = uvm_map_pageable(kernel_map, (vaddr_t)vcpu->comm,
	    (vaddr_t)vcpu->comm + PAGE_SIZE, false, 0);
	if (error) {
		nvmm_vcpu_free(mach, vcpu);
		nvmm_vcpu_put(vcpu);
		goto out;
	}
	memset(vcpu->comm, 0, PAGE_SIZE);

	error = (*nvmm_impl->vcpu_create)(mach, vcpu);
	if (error) {
		nvmm_vcpu_free(mach, vcpu);
		nvmm_vcpu_put(vcpu);
		goto out;
	}

	nvmm_vcpu_put(vcpu);

	atomic_inc_uint(&mach->ncpus);

out:
	nvmm_machine_put(mach);
	return error;
}

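/*
 * Note: each comm page thus ends up mapped twice: wired into kernel_map
 * here (backed by the machine's commuobj), and again into the owner's
 * address space, pageable, when userland mmaps the device node (see
 * nvmm_mmap() below).  Both mappings share the same physical page, which
 * lets the kernel and userland exchange VCPU state without copyin/copyout
 * on the hot path.
 */
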
static int
nvmm_vcpu_destroy(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_destroy *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	(*nvmm_impl->vcpu_destroy)(mach, vcpu);
	nvmm_vcpu_free(mach, vcpu);
	nvmm_vcpu_put(vcpu);

	atomic_dec_uint(&mach->ncpus);

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_vcpu_configure(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_configure *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	size_t allocsz;
	uint64_t op;
	void *data;
	int error;

	op = NVMM_VCPU_CONF_MD(args->op);
	if (__predict_false(op >= nvmm_impl->vcpu_conf_max))
		return EINVAL;

	allocsz = nvmm_impl->vcpu_conf_sizes[op];
	data = kmem_alloc(allocsz, KM_SLEEP);

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error) {
		kmem_free(data, allocsz);
		return error;
	}

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error) {
		nvmm_machine_put(mach);
		kmem_free(data, allocsz);
		return error;
	}

	error = copyin(args->conf, data, allocsz);
	if (error) {
		goto out;
	}

	error = (*nvmm_impl->vcpu_configure)(vcpu, op, data);

out:
	nvmm_vcpu_put(vcpu);
	nvmm_machine_put(mach);
	kmem_free(data, allocsz);
	return error;
}

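/*
 * Illustrative userland call (sketch): args->conf points at an
 * op-specific structure whose size the backend advertises in
 * vcpu_conf_sizes[], which is exactly how much gets copied in above.
 * NVMM_VCPU_CONF_CPUID is the x86 CPUID-filtering op.
 *
 *	struct nvmm_vcpu_conf_cpuid cpuid_conf;
 *	memset(&cpuid_conf, 0, sizeof(cpuid_conf));
 *	struct nvmm_ioc_vcpu_configure args = {
 *		.machid = machid, .cpuid = cpuid,
 *		.op = NVMM_VCPU_CONF_CPUID, .conf = &cpuid_conf,
 *	};
 *	if (ioctl(fd, NVMM_IOC_VCPU_CONFIGURE, &args) == -1)
 *		err(EXIT_FAILURE, "vcpu_configure");
 */
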
static int
nvmm_vcpu_setstate(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_setstate *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	(*nvmm_impl->vcpu_setstate)(vcpu);
	nvmm_vcpu_put(vcpu);

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_vcpu_getstate(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_getstate *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	(*nvmm_impl->vcpu_getstate)(vcpu);
	nvmm_vcpu_put(vcpu);

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_vcpu_inject(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_inject *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	error = (*nvmm_impl->vcpu_inject)(vcpu);
	nvmm_vcpu_put(vcpu);

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_do_vcpu_run(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
    struct nvmm_vcpu_exit *exit)
{
	struct vmspace *vm = mach->vm;
	int ret;

	while (1) {
		/* Got a signal? Or pending resched? Leave. */
		if (__predict_false(nvmm_return_needed())) {
			exit->reason = NVMM_VCPU_EXIT_NONE;
			return 0;
		}

		/* Run the VCPU. */
		ret = (*nvmm_impl->vcpu_run)(mach, vcpu, exit);
		if (__predict_false(ret != 0)) {
			return ret;
		}

		/* Process nested page faults. */
		if (__predict_true(exit->reason != NVMM_VCPU_EXIT_MEMORY)) {
			break;
		}
		if (exit->u.mem.gpa >= mach->gpa_end) {
			break;
		}
		if (uvm_fault(&vm->vm_map, exit->u.mem.gpa, exit->u.mem.prot)) {
			break;
		}
	}

	return 0;
}

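/*
 * Illustrative userland run loop (sketch): callers keep re-issuing
 * NVMM_IOC_VCPU_RUN and dispatch on the exit reason.  An exit with
 * NVMM_VCPU_EXIT_NONE is not an error; it means the kernel returned
 * early (signal or preemption pending) and the VCPU should simply be
 * resumed.
 *
 *	struct nvmm_ioc_vcpu_run args = { .machid = machid, .cpuid = cpuid };
 *	for (;;) {
 *		if (ioctl(fd, NVMM_IOC_VCPU_RUN, &args) == -1)
 *			err(EXIT_FAILURE, "vcpu_run");
 *		switch (args.exit.reason) {
 *		case NVMM_VCPU_EXIT_NONE:
 *			continue;
 *		... handle I/O, memory and other exits ...
 *		}
 *	}
 */
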
static int
nvmm_vcpu_run(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_run *args)
{
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
	if (error)
		goto out;

	error = nvmm_do_vcpu_run(mach, vcpu, &args->exit);
	nvmm_vcpu_put(vcpu);

out:
	nvmm_machine_put(mach);
	return error;
}

/* -------------------------------------------------------------------------- */

static struct uvm_object *
nvmm_hmapping_getuobj(struct nvmm_machine *mach, uintptr_t hva, size_t size,
   size_t *off)
{
	struct nvmm_hmapping *hmapping;
	size_t i;

	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		hmapping = &mach->hmap[i];
		if (!hmapping->present) {
			continue;
		}
		if (hva >= hmapping->hva &&
		    hva + size <= hmapping->hva + hmapping->size) {
			*off = hva - hmapping->hva;
			return hmapping->uobj;
		}
	}

	return NULL;
}

static int
nvmm_hmapping_validate(struct nvmm_machine *mach, uintptr_t hva, size_t size)
{
	struct nvmm_hmapping *hmapping;
	size_t i;

	if ((hva % PAGE_SIZE) != 0 || (size % PAGE_SIZE) != 0) {
		return EINVAL;
	}
	if (hva == 0) {
		return EINVAL;
	}

	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		hmapping = &mach->hmap[i];
		if (!hmapping->present) {
			continue;
		}

		if (hva >= hmapping->hva &&
		    hva + size <= hmapping->hva + hmapping->size) {
			break;
		}

		if (hva >= hmapping->hva &&
		    hva < hmapping->hva + hmapping->size) {
			return EEXIST;
		}
		if (hva + size > hmapping->hva &&
		    hva + size <= hmapping->hva + hmapping->size) {
			return EEXIST;
		}
		if (hva <= hmapping->hva &&
		    hva + size >= hmapping->hva + hmapping->size) {
			return EEXIST;
		}
	}

	return 0;
}

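/*
 * Example of the overlap rules above (illustrative): with an existing
 * mapping covering [0x1000, 0x3000), a request for [0x1000, 0x2000) is
 * fully contained and passes validation, while [0x2000, 0x4000) trips
 * the first EEXIST test and [0x0000, 0x4000) trips the last one.
 */
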
static struct nvmm_hmapping *
nvmm_hmapping_alloc(struct nvmm_machine *mach)
{
	struct nvmm_hmapping *hmapping;
	size_t i;

	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		hmapping = &mach->hmap[i];
		if (!hmapping->present) {
			hmapping->present = true;
			return hmapping;
		}
	}

	return NULL;
}

static int
nvmm_hmapping_free(struct nvmm_machine *mach, uintptr_t hva, size_t size)
{
	struct vmspace *vmspace = curproc->p_vmspace;
	struct nvmm_hmapping *hmapping;
	size_t i;

	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		hmapping = &mach->hmap[i];
		if (!hmapping->present || hmapping->hva != hva ||
		    hmapping->size != size) {
			continue;
		}

		uvm_unmap(&vmspace->vm_map, hmapping->hva,
		    hmapping->hva + hmapping->size);
		uao_detach(hmapping->uobj);

		hmapping->uobj = NULL;
		hmapping->present = false;

		return 0;
	}

	return ENOENT;
}

static int
nvmm_hva_map(struct nvmm_owner *owner, struct nvmm_ioc_hva_map *args)
{
	struct vmspace *vmspace = curproc->p_vmspace;
	struct nvmm_machine *mach;
	struct nvmm_hmapping *hmapping;
	vaddr_t uva;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, true);
	if (error)
		return error;

	error = nvmm_hmapping_validate(mach, args->hva, args->size);
	if (error)
		goto out;

	hmapping = nvmm_hmapping_alloc(mach);
	if (hmapping == NULL) {
		error = ENOBUFS;
		goto out;
	}

	hmapping->hva = args->hva;
	hmapping->size = args->size;
	hmapping->uobj = uao_create(hmapping->size, 0);
	uva = hmapping->hva;

	/* Take a reference for the user. */
	uao_reference(hmapping->uobj);

	/* Map the uobj into the user address space, as pageable. */
	error = uvm_map(&vmspace->vm_map, &uva, hmapping->size, hmapping->uobj,
	    0, 0, UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_SHARE,
	    UVM_ADV_RANDOM, UVM_FLAG_FIXED|UVM_FLAG_UNMAP));
	if (error) {
		uao_detach(hmapping->uobj);
	}

out:
	nvmm_machine_put(mach);
	return error;
}

static int
nvmm_hva_unmap(struct nvmm_owner *owner, struct nvmm_ioc_hva_unmap *args)
{
	struct nvmm_machine *mach;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, true);
	if (error)
		return error;

	error = nvmm_hmapping_free(mach, args->hva, args->size);

	nvmm_machine_put(mach);
	return error;
}

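/*
 * Illustrative userland sequence for host mappings (sketch): reserve a
 * page-aligned range with mmap(2), then have the kernel replace it with
 * the anonymous uobj created above (UVM_FLAG_FIXED|UVM_FLAG_UNMAP):
 *
 *	void *hva = mmap(NULL, size, PROT_READ|PROT_WRITE,
 *	    MAP_ANON|MAP_PRIVATE, -1, 0);
 *	struct nvmm_ioc_hva_map args = { .machid = machid,
 *	    .hva = (uintptr_t)hva, .size = size };
 *	if (hva == MAP_FAILED || ioctl(fd, NVMM_IOC_HVA_MAP, &args) == -1)
 *		err(EXIT_FAILURE, "hva_map");
 */
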
/* -------------------------------------------------------------------------- */

static int
nvmm_gpa_map(struct nvmm_owner *owner, struct nvmm_ioc_gpa_map *args)
{
	struct nvmm_machine *mach;
	struct uvm_object *uobj;
	gpaddr_t gpa;
	size_t off;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	if ((args->prot & ~(PROT_READ|PROT_WRITE|PROT_EXEC)) != 0) {
		error = EINVAL;
		goto out;
	}

	if ((args->gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0 ||
	    (args->hva % PAGE_SIZE) != 0) {
		error = EINVAL;
		goto out;
	}
	if (args->hva == 0) {
		error = EINVAL;
		goto out;
	}
	if (args->gpa < mach->gpa_begin || args->gpa >= mach->gpa_end) {
		error = EINVAL;
		goto out;
	}
	if (args->gpa + args->size <= args->gpa) {
		error = EINVAL;
		goto out;
	}
	if (args->gpa + args->size > mach->gpa_end) {
		error = EINVAL;
		goto out;
	}
	gpa = args->gpa;

	uobj = nvmm_hmapping_getuobj(mach, args->hva, args->size, &off);
	if (uobj == NULL) {
		error = EINVAL;
		goto out;
	}

	/* Take a reference for the machine. */
	uao_reference(uobj);

	/* Map the uobj into the machine address space, as pageable. */
	error = uvm_map(&mach->vm->vm_map, &gpa, args->size, uobj, off, 0,
	    UVM_MAPFLAG(args->prot, UVM_PROT_RWX, UVM_INH_NONE,
	    UVM_ADV_RANDOM, UVM_FLAG_FIXED|UVM_FLAG_UNMAP));
	if (error) {
		uao_detach(uobj);
		goto out;
	}
	if (gpa != args->gpa) {
		uao_detach(uobj);
		printf("[!] uvm_map problem\n");
		error = EINVAL;
		goto out;
	}

out:
	nvmm_machine_put(mach);
	return error;
}

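/*
 * Illustrative pairing of the two mapping levels (sketch): userland first
 * registers a host range with NVMM_IOC_HVA_MAP, then maps a window of it
 * into the guest physical space with NVMM_IOC_GPA_MAP:
 *
 *	struct nvmm_ioc_gpa_map args = {
 *		.machid = machid, .hva = hva, .gpa = gpa, .size = size,
 *		.prot = PROT_READ|PROT_WRITE|PROT_EXEC,
 *	};
 *	if (ioctl(fd, NVMM_IOC_GPA_MAP, &args) == -1)
 *		err(EXIT_FAILURE, "gpa_map");
 */
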
static int
nvmm_gpa_unmap(struct nvmm_owner *owner, struct nvmm_ioc_gpa_unmap *args)
{
	struct nvmm_machine *mach;
	gpaddr_t gpa;
	int error;

	error = nvmm_machine_get(owner, args->machid, &mach, false);
	if (error)
		return error;

	if ((args->gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0) {
		error = EINVAL;
		goto out;
	}
	if (args->gpa < mach->gpa_begin || args->gpa >= mach->gpa_end) {
		error = EINVAL;
		goto out;
	}
	if (args->gpa + args->size <= args->gpa) {
		error = EINVAL;
		goto out;
	}
	if (args->gpa + args->size > mach->gpa_end) {
		error = EINVAL;
		goto out;
	}
	gpa = args->gpa;

	/* Unmap the memory from the machine. */
	uvm_unmap(&mach->vm->vm_map, gpa, gpa + args->size);

out:
	nvmm_machine_put(mach);
	return error;
}

/* -------------------------------------------------------------------------- */

static int
nvmm_ctl_mach_info(struct nvmm_owner *owner, struct nvmm_ioc_ctl *args)
{
	struct nvmm_ctl_mach_info ctl;
	struct nvmm_machine *mach;
	struct nvmm_cpu *vcpu;
	int error;
	size_t i;

	if (args->size != sizeof(ctl))
		return EINVAL;
	error = copyin(args->data, &ctl, sizeof(ctl));
	if (error)
		return error;

	error = nvmm_machine_get(owner, ctl.machid, &mach, true);
	if (error)
		return error;

	ctl.nvcpus = 0;
	for (i = 0; i < NVMM_MAX_VCPUS; i++) {
		error = nvmm_vcpu_get(mach, i, &vcpu);
		if (error)
			continue;
		ctl.nvcpus++;
		nvmm_vcpu_put(vcpu);
	}

	ctl.nram = 0;
	for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
		if (!mach->hmap[i].present)
			continue;
		ctl.nram += mach->hmap[i].size;
	}

	ctl.pid = mach->owner->pid;
	ctl.time = mach->time;

	nvmm_machine_put(mach);

	error = copyout(&ctl, args->data, sizeof(ctl));
	if (error)
		return error;

	return 0;
}

static int
nvmm_ctl(struct nvmm_owner *owner, struct nvmm_ioc_ctl *args)
{
	switch (args->op) {
	case NVMM_CTL_MACH_INFO:
		return nvmm_ctl_mach_info(owner, args);
	default:
		return EINVAL;
	}
}

/* -------------------------------------------------------------------------- */

static const struct nvmm_impl *
nvmm_ident(void)
{
	size_t i;

	for (i = 0; i < __arraycount(nvmm_impl_list); i++) {
		if ((*nvmm_impl_list[i]->ident)())
			return nvmm_impl_list[i];
	}

	return NULL;
}

static int
nvmm_init(void)
{
	size_t i, n;

	nvmm_impl = nvmm_ident();
	if (nvmm_impl == NULL)
		return ENOTSUP;

	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
		machines[i].machid = i;
		rw_init(&machines[i].lock);
		for (n = 0; n < NVMM_MAX_VCPUS; n++) {
			machines[i].cpus[n].present = false;
			machines[i].cpus[n].cpuid = n;
			mutex_init(&machines[i].cpus[n].lock, MUTEX_DEFAULT,
			    IPL_NONE);
		}
	}

	(*nvmm_impl->init)();

	return 0;
}

static void
nvmm_fini(void)
{
	size_t i, n;

	for (i = 0; i < NVMM_MAX_MACHINES; i++) {
		rw_destroy(&machines[i].lock);
		for (n = 0; n < NVMM_MAX_VCPUS; n++) {
			mutex_destroy(&machines[i].cpus[n].lock);
		}
	}

	(*nvmm_impl->fini)();
	nvmm_impl = NULL;
}

/* -------------------------------------------------------------------------- */

static dev_type_open(nvmm_open);

const struct cdevsw nvmm_cdevsw = {
	.d_open = nvmm_open,
	.d_close = noclose,
	.d_read = noread,
	.d_write = nowrite,
	.d_ioctl = noioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_OTHER | D_MPSAFE
};

static int nvmm_ioctl(file_t *, u_long, void *);
static int nvmm_close(file_t *);
static int nvmm_mmap(file_t *, off_t *, size_t, int, int *, int *,
    struct uvm_object **, int *);

const struct fileops nvmm_fileops = {
	.fo_read = fbadop_read,
	.fo_write = fbadop_write,
	.fo_ioctl = nvmm_ioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = fnullop_poll,
	.fo_stat = fbadop_stat,
	.fo_close = nvmm_close,
	.fo_kqfilter = fnullop_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = nvmm_mmap,
};

static int
nvmm_open(dev_t dev, int flags, int type, struct lwp *l)
{
	struct nvmm_owner *owner;
	struct file *fp;
	int error, fd;

	if (__predict_false(nvmm_impl == NULL))
		return ENXIO;
	if (minor(dev) != 0)
		return EXDEV;
	if (!(flags & O_CLOEXEC))
		return EINVAL;
	error = fd_allocfile(&fp, &fd);
	if (error)
		return error;

	if (OFLAGS(flags) & O_WRONLY) {
		owner = &root_owner;
	} else {
		owner = kmem_alloc(sizeof(*owner), KM_SLEEP);
		owner->pid = l->l_proc->p_pid;
	}

	return fd_clone(fp, fd, flags, &nvmm_fileops, owner);
}

static int
nvmm_close(file_t *fp)
{
	struct nvmm_owner *owner = fp->f_data;

	KASSERT(owner != NULL);
	nvmm_kill_machines(owner);
	if (owner != &root_owner) {
		kmem_free(owner, sizeof(*owner));
	}
	fp->f_data = NULL;

	return 0;
}

static int
nvmm_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
	struct nvmm_owner *owner = fp->f_data;
	struct nvmm_machine *mach;
	nvmm_machid_t machid;
	nvmm_cpuid_t cpuid;
	int error;

	if (prot & PROT_EXEC)
		return EACCES;
	if (size != PAGE_SIZE)
		return EINVAL;

	cpuid = NVMM_COMM_CPUID(*offp);
	if (__predict_false(cpuid >= NVMM_MAX_VCPUS))
		return EINVAL;

	machid = NVMM_COMM_MACHID(*offp);
	error = nvmm_machine_get(owner, machid, &mach, false);
	if (error)
		return error;

	uao_reference(mach->commuobj);
	*uobjp = mach->commuobj;
	*offp = cpuid * PAGE_SIZE;
	*maxprotp = prot;
	*advicep = UVM_ADV_RANDOM;

	nvmm_machine_put(mach);
	return 0;
}

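/*
 * Illustrative userland mapping of a comm page (sketch): the mmap offset
 * encodes both the machine and VCPU IDs, which are unpacked above with
 * NVMM_COMM_MACHID()/NVMM_COMM_CPUID().  Assuming the NVMM_COMM_OFF()
 * packing macro from nvmm.h:
 *
 *	struct nvmm_comm_page *comm = mmap(NULL, PAGE_SIZE,
 *	    PROT_READ|PROT_WRITE, MAP_SHARED, fd,
 *	    NVMM_COMM_OFF(machid, cpuid));
 *	if (comm == MAP_FAILED)
 *		err(EXIT_FAILURE, "comm mmap");
 */
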
static int
nvmm_ioctl(file_t *fp, u_long cmd, void *data)
{
	struct nvmm_owner *owner = fp->f_data;

	KASSERT(owner != NULL);

	switch (cmd) {
	case NVMM_IOC_CAPABILITY:
		return nvmm_capability(owner, data);
	case NVMM_IOC_MACHINE_CREATE:
		return nvmm_machine_create(owner, data);
	case NVMM_IOC_MACHINE_DESTROY:
		return nvmm_machine_destroy(owner, data);
	case NVMM_IOC_MACHINE_CONFIGURE:
		return nvmm_machine_configure(owner, data);
	case NVMM_IOC_VCPU_CREATE:
		return nvmm_vcpu_create(owner, data);
	case NVMM_IOC_VCPU_DESTROY:
		return nvmm_vcpu_destroy(owner, data);
	case NVMM_IOC_VCPU_CONFIGURE:
		return nvmm_vcpu_configure(owner, data);
	case NVMM_IOC_VCPU_SETSTATE:
		return nvmm_vcpu_setstate(owner, data);
	case NVMM_IOC_VCPU_GETSTATE:
		return nvmm_vcpu_getstate(owner, data);
	case NVMM_IOC_VCPU_INJECT:
		return nvmm_vcpu_inject(owner, data);
	case NVMM_IOC_VCPU_RUN:
		return nvmm_vcpu_run(owner, data);
	case NVMM_IOC_GPA_MAP:
		return nvmm_gpa_map(owner, data);
	case NVMM_IOC_GPA_UNMAP:
		return nvmm_gpa_unmap(owner, data);
	case NVMM_IOC_HVA_MAP:
		return nvmm_hva_map(owner, data);
	case NVMM_IOC_HVA_UNMAP:
		return nvmm_hva_unmap(owner, data);
	case NVMM_IOC_CTL:
		return nvmm_ctl(owner, data);
	default:
		return EINVAL;
	}
}

/* -------------------------------------------------------------------------- */

static int nvmm_match(device_t, cfdata_t, void *);
static void nvmm_attach(device_t, device_t, void *);
static int nvmm_detach(device_t, int);

extern struct cfdriver nvmm_cd;

CFATTACH_DECL_NEW(nvmm, 0, nvmm_match, nvmm_attach, nvmm_detach, NULL);

static struct cfdata nvmm_cfdata[] = {
	{
		.cf_name = "nvmm",
		.cf_atname = "nvmm",
		.cf_unit = 0,
		.cf_fstate = FSTATE_STAR,
		.cf_loc = NULL,
		.cf_flags = 0,
		.cf_pspec = NULL,
	},
	{ NULL, NULL, 0, FSTATE_NOTFOUND, NULL, 0, NULL }
};

static int
nvmm_match(device_t self, cfdata_t cfdata, void *arg)
{
	return 1;
}

static void
nvmm_attach(device_t parent, device_t self, void *aux)
{
	int error;

	error = nvmm_init();
	if (error)
		panic("%s: impossible", __func__);
	aprint_normal_dev(self, "attached, using backend %s\n",
	    nvmm_impl->name);
}

static int
nvmm_detach(device_t self, int flags)
{
	if (nmachines > 0)
		return EBUSY;
	nvmm_fini();
	return 0;
}

void
nvmmattach(int nunits)
{
	/* nothing */
}

MODULE(MODULE_CLASS_MISC, nvmm, NULL);

#if defined(_MODULE)
CFDRIVER_DECL(nvmm, DV_VIRTUAL, NULL);
#endif

static int
nvmm_modcmd(modcmd_t cmd, void *arg)
{
#if defined(_MODULE)
	devmajor_t bmajor = NODEVMAJOR;
	devmajor_t cmajor = 345;
#endif
	int error;

	switch (cmd) {
	case MODULE_CMD_INIT:
		if (nvmm_ident() == NULL) {
			aprint_error("%s: cpu not supported\n",
			    nvmm_cd.cd_name);
			return ENOTSUP;
		}
#if defined(_MODULE)
		error = config_cfdriver_attach(&nvmm_cd);
		if (error)
			return error;
#endif
		error = config_cfattach_attach(nvmm_cd.cd_name, &nvmm_ca);
		if (error) {
			config_cfdriver_detach(&nvmm_cd);
			aprint_error("%s: config_cfattach_attach failed\n",
			    nvmm_cd.cd_name);
			return error;
		}

		error = config_cfdata_attach(nvmm_cfdata, 1);
		if (error) {
			config_cfattach_detach(nvmm_cd.cd_name, &nvmm_ca);
			config_cfdriver_detach(&nvmm_cd);
			aprint_error("%s: unable to register cfdata\n",
			    nvmm_cd.cd_name);
			return error;
		}

		if (config_attach_pseudo(nvmm_cfdata) == NULL) {
			aprint_error("%s: config_attach_pseudo failed\n",
			    nvmm_cd.cd_name);
			config_cfattach_detach(nvmm_cd.cd_name, &nvmm_ca);
			config_cfdriver_detach(&nvmm_cd);
			return ENXIO;
		}

#if defined(_MODULE)
		/* mknod /dev/nvmm c 345 0 */
		error = devsw_attach(nvmm_cd.cd_name, NULL, &bmajor,
			&nvmm_cdevsw, &cmajor);
		if (error) {
			aprint_error("%s: unable to register devsw\n",
			    nvmm_cd.cd_name);
			config_cfattach_detach(nvmm_cd.cd_name, &nvmm_ca);
			config_cfdriver_detach(&nvmm_cd);
			return error;
		}
#endif
		return 0;
	case MODULE_CMD_FINI:
		error = config_cfdata_detach(nvmm_cfdata);
		if (error)
			return error;
		error = config_cfattach_detach(nvmm_cd.cd_name, &nvmm_ca);
		if (error)
			return error;
#if defined(_MODULE)
		config_cfdriver_detach(&nvmm_cd);
		devsw_detach(NULL, &nvmm_cdevsw);
#endif
		return 0;
	case MODULE_CMD_AUTOUNLOAD:
		return EBUSY;
	default:
		return ENOTTY;
	}
}