/*	$NetBSD: hypervisor_machdep.c,v 1.45 2022/09/07 00:40:19 knakahara Exp $	*/

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/******************************************************************************
 * hypervisor.c
 *
 * Communication to/from hypervisor.
 *
 * Copyright (c) 2002-2004, K A Fraser
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */


#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.45 2022/09/07 00:40:19 knakahara Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/cpu.h>
#include <sys/ksyms.h>

#include <uvm/uvm_extern.h>

#include <machine/vmparam.h>
#include <machine/pmap.h>
#include <machine/pmap_private.h>

#include <x86/machdep.h>
#include <x86/cpuvar.h>

#include <xen/xen.h>
#include <xen/intr.h>
#include <xen/hypervisor.h>
#include <xen/evtchn.h>
#include <xen/xenpmap.h>

#include "opt_xen.h"
#include "opt_modular.h"
#include "opt_ddb.h"
#include "isa.h"
#include "pci.h"
#include "ksyms.h"

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_extern.h>
#include <ddb/db_output.h>
#include <ddb/db_interface.h>
#endif

#ifdef XENPV
/*
 * arch-dependent p2m frame lists list (L3 and L2)
 * used by Xen for save/restore mappings
 */
static unsigned long * l3_p2m_page;
static unsigned long * l2_p2m_page;
static int l2_p2m_page_size; /* size of L2 page, in pages */

static void build_p2m_frame_list_list(void);
static void update_p2m_frame_list_list(void);

#endif

// #define PORT_DEBUG 4
// #define EARLY_DEBUG_EVENT

/* callback function type */
typedef void (*iterate_func_t)(unsigned int, unsigned int,
			       unsigned int, void *);

static inline void
evt_iterate_bits(volatile unsigned long *pendingl1,
		 volatile unsigned long *pendingl2,
		 volatile unsigned long *mask,
		 iterate_func_t iterate_pending, void *iterate_args)
{

	KASSERT(pendingl1 != NULL);
	KASSERT(pendingl2 != NULL);

	unsigned long l1, l2;
	unsigned int l1i, l2i, port;

	l1 = xen_atomic_xchg(pendingl1, 0);
	while ((l1i = xen_ffs(l1)) != 0) {
		l1i--;
		l1 &= ~(1UL << l1i);

		l2 = pendingl2[l1i] & (mask != NULL ? ~mask[l1i] : -1UL);
		l2 &= curcpu()->ci_evtmask[l1i];

		if (mask != NULL) xen_atomic_setbits_l(&mask[l1i], l2);
		xen_atomic_clearbits_l(&pendingl2[l1i], l2);

		while ((l2i = xen_ffs(l2)) != 0) {
			l2i--;
			l2 &= ~(1UL << l2i);

			port = (l1i << LONG_SHIFT) + l2i;

			iterate_pending(port, l1i, l2i, iterate_args);
		}
	}
}
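
/*
 * Worked example of the two-level decode above (illustrative, assuming a
 * 64-bit port where a long holds 64 bits and LONG_SHIFT is 6): bit 2 set in
 * *pendingl1 selects word 2 of pendingl2[], and bit 5 set in that word then
 * decodes to port = (2 << 6) + 5 = 133.  Masked channels (mask[]) and
 * channels not routed to this CPU (ci_evtmask[]) are filtered out before
 * the callback is invoked.
 */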

/*
 * Set per-cpu "pending" information for outstanding events that
 * cannot be processed now.
 */

static inline void
evt_set_pending(unsigned int port, unsigned int l1i,
		unsigned int l2i, void *args)
{

	KASSERT(args != NULL);

	int *ret = args;
	struct intrhand *ih;

	if (evtsource[port]) {
		hypervisor_set_ipending(evtsource[port]->ev_imask, l1i, l2i);
		evtsource[port]->ev_evcnt.ev_count++;
		ih = evtsource[port]->ev_handlers;
		while (ih != NULL) {
			ih->ih_pending++;
			ih = ih->ih_evt_next;
		}

		if (*ret == 0 && curcpu()->ci_ilevel <
		    evtsource[port]->ev_maxlevel)
			*ret = 1;
	}
#ifdef DOM0OPS
	else  {
		/* set pending event */
		xenevt_setipending(l1i, l2i);
	}
#endif
}

int stipending(void);
int
stipending(void)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
	struct cpu_info *ci;
	volatile struct vcpu_info *vci;
	int ret;

	ret = 0;
	ci = curcpu();
	vci = ci->ci_vcpu;

#if 0
	if (HYPERVISOR_shared_info->events)
		printf("stipending events %08lx mask %08lx ilevel %d\n",
		    HYPERVISOR_shared_info->events,
		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel);
#endif

#ifdef EARLY_DEBUG_EVENT
	if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) {
		xen_debug_handler(NULL);
		xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port);
	}
#endif

	/*
	 * we're only called after STIC, so we know that we'll have to
	 * STI at the end
	 */

	while (vci->evtchn_upcall_pending) {
		x86_disable_intr();

		vci->evtchn_upcall_pending = 0;

		evt_iterate_bits(&vci->evtchn_pending_sel,
		    s->evtchn_pending, s->evtchn_mask,
		    evt_set_pending, &ret);

		x86_enable_intr();
	}

	return (ret);
}
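
/*
 * Note on the return value: stipending() only records work.
 * evt_set_pending() bumps counters and per-handler pending flags, and the
 * function returns nonzero when at least one recorded event has a maximum
 * IPL above the CPU's current ci_ilevel, presumably so the caller can
 * arrange for pending-interrupt processing.
 */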

/* Iterate through pending events and call the event handler */

static inline void
evt_do_hypervisor_callback(unsigned int port, unsigned int l1i,
			   unsigned int l2i, void *args)
{
	KASSERT(args != NULL);

#ifdef DOM0OPS
	struct cpu_info *ci = curcpu();
#endif
	struct intrframe *regs = args;

#ifdef PORT_DEBUG
	if (port == PORT_DEBUG)
		printf("do_hypervisor_callback event %d\n", port);
#endif
	if (evtsource[port]) {
		KASSERT(cpu_intr_p());
		evtchn_do_event(port, regs);
	}
#ifdef DOM0OPS
	else  {
		if (ci->ci_ilevel < IPL_HIGH) {
			/* fast path */
			int oipl = ci->ci_ilevel;
			ci->ci_ilevel = IPL_HIGH;
			KASSERT(cpu_intr_p());
			xenevt_event(port);
			ci->ci_ilevel = oipl;
		} else {
			/* set pending event */
			xenevt_setipending(l1i, l2i);
		}
	}
#endif
}

void
do_hypervisor_callback(struct intrframe *regs)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
	struct cpu_info *ci;
	volatile struct vcpu_info *vci;
	uint64_t level __diagused;

	ci = curcpu();
	vci = ci->ci_vcpu;
	level = ci->ci_ilevel;

	/* Save trapframe for clock handler */
	KASSERT(regs != NULL);
	ci->ci_xen_clockf_usermode = USERMODE(regs->_INTRFRAME_CS);
	ci->ci_xen_clockf_pc = regs->_INTRFRAME_IP;

	// DDD printf("do_hypervisor_callback\n");

#ifdef EARLY_DEBUG_EVENT
	if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) {
		xen_debug_handler(NULL);
		xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port);
	}
#endif

	while (vci->evtchn_upcall_pending) {
		vci->evtchn_upcall_pending = 0;

		evt_iterate_bits(&vci->evtchn_pending_sel,
		    s->evtchn_pending, s->evtchn_mask,
		    evt_do_hypervisor_callback, regs);
	}

#ifdef DIAGNOSTIC
	if (level != ci->ci_ilevel)
		printf("hypervisor done %08x level %" PRIu64 "/%" PRIu64 " ipending %0" PRIx64 "\n",
		    (uint)vci->evtchn_pending_sel,
		    level, (uint64_t)ci->ci_ilevel, (uint64_t)ci->ci_ipending);
#endif
}
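
/*
 * Illustrative note on the loop above: evtchn_upcall_pending is cleared
 * before each scan and then re-tested, because Xen may raise it again while
 * handlers run; looping until it stays clear avoids losing an upcall that
 * arrives during processing.
 */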

#if 0
void
hypervisor_send_event(struct cpu_info *ci, unsigned int ev)
{
	KASSERT(ci != NULL);

	volatile shared_info_t *s = HYPERVISOR_shared_info;
	volatile struct vcpu_info *vci = ci->ci_vcpu;

#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_send_event %d\n", ev);
#endif

	xen_atomic_set_bit(&s->evtchn_pending[0], ev);

	if (__predict_false(ci == curcpu())) {
		xen_atomic_set_bit(&vci->evtchn_pending_sel,
		    ev >> LONG_SHIFT);
		xen_atomic_set_bit(&vci->evtchn_upcall_pending, 0);
	}

	xen_atomic_clear_bit(&s->evtchn_mask[0], ev);

	if (__predict_true(ci == curcpu())) {
		hypervisor_force_callback();
	} else {
		if (__predict_false(xen_send_ipi(ci, XEN_IPI_HVCB))) {
			panic("xen_send_ipi(cpu%d id %d, XEN_IPI_HVCB) failed\n",
			    (int) ci->ci_cpuid, ci->ci_vcpuid);
		}
	}
}
#endif

void
hypervisor_unmask_event(unsigned int ev)
{

	KASSERT(ev > 0 && ev < NR_EVENT_CHANNELS);

#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_unmask_event %d\n", ev);
#endif

	/* Xen unmasks the evtchn_mask[0]:ev bit for us. */
	evtchn_op_t op;
	op.cmd = EVTCHNOP_unmask;
	op.u.unmask.port = ev;
	if (HYPERVISOR_event_channel_op(&op) != 0)
		panic("Failed to unmask event %d\n", ev);

	return;
}
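
/*
 * Unmasking goes through the EVTCHNOP_unmask hypercall rather than simply
 * clearing the bit in evtchn_mask[]: Xen then re-checks the channel's
 * pending bit and re-raises the upcall if an event arrived while the
 * channel was masked, so the notification is not lost.
 */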

void
hypervisor_mask_event(unsigned int ev)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_mask_event %d\n", ev);
#endif

	xen_atomic_set_bit(&s->evtchn_mask[0], ev);
}

void
hypervisor_clear_event(unsigned int ev)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_clear_event %d\n", ev);
#endif

	xen_atomic_clear_bit(&s->evtchn_pending[0], ev);
}

static inline void
evt_enable_event(unsigned int port, unsigned int l1i,
		 unsigned int l2i, void *args)
{
	KASSERT(args == NULL);
	hypervisor_unmask_event(port);
#if defined(XENPV) && (NPCI > 0 || NISA > 0)
	hypervisor_ack_pirq_event(port);
#endif /* NPCI > 0 || NISA > 0 */
}

void
hypervisor_enable_sir(unsigned int sir)
{
	struct cpu_info *ci = curcpu();

	/*
	 * Enable all events for this IPL. As we only set an event in
	 * ipl_evt_mask for its lowest IPL, and pending IPLs are processed
	 * from high to low, we know that all callbacks for this event have
	 * been processed.
	 */

	evt_iterate_bits(&ci->ci_isources[sir]->ipl_evt_mask1,
	    ci->ci_isources[sir]->ipl_evt_mask2, NULL,
	    evt_enable_event, NULL);

}

void
hypervisor_set_ipending(uint64_t imask, int l1, int l2)
{

	/* This function is not re-entrant */
	KASSERT(x86_read_psl() != 0);

	int sir;
	struct cpu_info *ci = curcpu();

	/* set pending bit for the appropriate IPLs */
	ci->ci_ipending |= imask;

	/*
	 * And set the event pending bit for the lowest IPL. As IPLs are
	 * handled from high to low, this ensures that all callbacks will
	 * have been called when we ack the event.
	 */
	sir = ffs(imask);
	KASSERT(sir > SIR_XENIPL_VM);
	sir--;
	KASSERT(sir <= SIR_XENIPL_HIGH);
	KASSERT(ci->ci_isources[sir] != NULL);
	ci->ci_isources[sir]->ipl_evt_mask1 |= 1UL << l1;
	ci->ci_isources[sir]->ipl_evt_mask2[l1] |= 1UL << l2;
	KASSERT(ci == curcpu());
#if 0
	if (__predict_false(ci != curcpu())) {
		if (xen_send_ipi(ci, XEN_IPI_HVCB)) {
			panic("hypervisor_set_ipending: "
			    "xen_send_ipi(cpu%d id %d, XEN_IPI_HVCB) failed\n",
			    (int) ci->ci_cpuid, ci->ci_vcpuid);
		}
	}
#endif
}
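
/*
 * Illustration (hypothetical mask value): if imask has bits 5 and 7 set,
 * ffs(imask) returns 6, so sir becomes 5 and the event is recorded only in
 * the lowest pending soft-interrupt slot; the higher IPLs remain marked in
 * ci_ipending and are replayed first, which is what the comment above
 * relies on.
 */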

void
hypervisor_machdep_attach(void)
{
#ifdef XENPV
	/* dom0 does not require the arch-dependent P2M translation table */
	if (!xendomain_is_dom0()) {
		build_p2m_frame_list_list();
		sysctl_xen_suspend_setup();
	}
#endif
}

void
hypervisor_machdep_resume(void)
{
#ifdef XENPV
	/* dom0 does not require the arch-dependent P2M translation table */
	if (!xendomain_is_dom0())
		update_p2m_frame_list_list();
#endif
}

/*
 * idle_block()
 *
 *	Called from the idle loop when we have nothing to do but wait
 *	for an interrupt.
 */
static void
idle_block(void)
{
	KASSERT(curcpu()->ci_ipending == 0);
	HYPERVISOR_block();
	KASSERT(curcpu()->ci_ipending == 0);
}

void
x86_cpu_idle_xen(void)
{
	struct cpu_info *ci = curcpu();

	KASSERT(ci->ci_ilevel == IPL_NONE);

	x86_disable_intr();
	if (__predict_false(!ci->ci_want_resched)) {
		idle_block();
	} else {
		x86_enable_intr();
	}
}
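
/*
 * Note on the interrupt dance above (Xen PV semantics, as documented for
 * SCHEDOP_block): x86_disable_intr() masks event delivery before
 * ci_want_resched is tested, so a wakeup cannot slip in between the check
 * and the block.  HYPERVISOR_block() re-enables event delivery and checks
 * for pending events before actually blocking, which is why only the
 * non-blocking branch calls x86_enable_intr() explicitly.
 */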

#ifdef XENPV
/*
 * Generate the p2m_frame_list_list table,
 * needed for guest save/restore
 */
static void
build_p2m_frame_list_list(void)
{
	int fpp; /* number of page (frame) pointers per page */
	unsigned long max_pfn;
	/*
	 * The p2m list is composed of three levels of indirection,
	 * each layer containing MFNs pointing to lower level pages.
	 * The indirection is used to convert a given PFN to its MFN.
	 * Each level-N page can point to @fpp level-(N-1) pages.
	 * For example, for 32-bit x86 we have:
	 * - PAGE_SIZE: 4096 bytes
	 * - fpp: 1024 (one L3 page can address 1024 L2 pages)
	 * An L1 page contains the list of MFNs we are looking for.
	 */
	max_pfn = xen_start_info.nr_pages;
	fpp = PAGE_SIZE / sizeof(xen_pfn_t);

	/* we only need one L3 page */
	l3_p2m_page = (vaddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE,
	    PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_NOWAIT);
	if (l3_p2m_page == NULL)
		panic("could not allocate memory for l3_p2m_page");

	/*
	 * Determine how many L2 pages we need for the mapping.
	 * Each L2 page can map a total of @fpp L1 pages.
	 */
	l2_p2m_page_size = howmany(max_pfn, fpp);

	l2_p2m_page = (vaddr_t *)uvm_km_alloc(kernel_map,
	    l2_p2m_page_size * PAGE_SIZE,
	    PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_NOWAIT);
	if (l2_p2m_page == NULL)
		panic("could not allocate memory for l2_p2m_page");

	/* We now have L3 and L2 pages ready, update L1 mapping */
	update_p2m_frame_list_list();

}
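
/*
 * Worked example (illustrative, assuming 4 KiB pages and an 8-byte
 * xen_pfn_t, i.e. a 64-bit PV guest): fpp = 4096 / 8 = 512.  A guest with
 * max_pfn = 1048576 (4 GiB) then needs howmany(1048576, 512) = 2048 L1
 * frames; their MFNs fill 2048 / 512 = 4 pages of l2_p2m_page[], and the
 * MFNs of those 4 pages occupy the first 4 entries of the single L3 page.
 */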

/*
 * Update the L1 p2m_frame_list_list mapping (during guest boot or resume)
 */
static void
update_p2m_frame_list_list(void)
{
	int i;
	int fpp; /* number of page (frame) pointers per page */
	unsigned long max_pfn;

	max_pfn = xen_start_info.nr_pages;
	fpp = PAGE_SIZE / sizeof(xen_pfn_t);

	for (i = 0; i < l2_p2m_page_size; i++) {
		/*
		 * Each time we start a new L2 page,
		 * store its MFN in the L3 page.
		 */
		if ((i % fpp) == 0) {
			l3_p2m_page[i/fpp] = vtomfn(
				(vaddr_t)&l2_p2m_page[i]);
		}
		/*
		 * We use a shortcut: since the
		 * @xpmap_phys_to_machine_mapping array
		 * already contains the PFN to MFN mapping, we just
		 * set each l2_p2m_page entry to the MFN of the
		 * corresponding frame of @xpmap_phys_to_machine_mapping.
		 */
		l2_p2m_page[i] = vtomfn((vaddr_t)
			&xpmap_phys_to_machine_mapping[i*fpp]);
	}

	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
					vtomfn((vaddr_t)l3_p2m_page);
	HYPERVISOR_shared_info->arch.max_pfn = max_pfn;

}
#endif /* XENPV */

void
xen_init_ksyms(void)
{
#if NKSYMS || defined(DDB) || defined(MODULAR)
	extern int end;
	extern int *esym;
#ifdef DDB
	db_machine_init();
#endif

#ifdef XENPV
	esym = xen_start_info.mod_start ?
	    (void *)xen_start_info.mod_start :
	    (void *)xen_start_info.mfn_list;
#endif /* XENPV */
	/* for PVH, esym is set in locore.S */
	ksyms_addsyms_elf(*(int *)(void *)&end,
	    ((int *)(void *)&end) + 1, esym);
#endif
}