/* $NetBSD: hypervisor_machdep.c,v 1.39 2020/05/02 16:44:36 bouyer Exp $ */

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/******************************************************************************
 * hypervisor.c
 *
 * Communication to/from hypervisor.
 *
 * Copyright (c) 2002-2004, K A Fraser
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.39 2020/05/02 16:44:36 bouyer Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/cpu.h>
#include <sys/ksyms.h>

#include <uvm/uvm_extern.h>

#include <machine/vmparam.h>
#include <machine/pmap.h>

#include <x86/machdep.h>
#include <x86/cpuvar.h>

#include <xen/xen.h>
#include <xen/intr.h>
#include <xen/hypervisor.h>
#include <xen/evtchn.h>
#include <xen/xenpmap.h>

#include "opt_xen.h"
#include "opt_modular.h"
#include "opt_ddb.h"
#include "isa.h"
#include "pci.h"
#include "ksyms.h"

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_extern.h>
#include <ddb/db_output.h>
#include <ddb/db_interface.h>
#endif

#ifdef XENPV
/*
 * arch-dependent p2m frame lists list (L3 and L2)
 * used by Xen for save/restore mappings
 */
static unsigned long * l3_p2m_page;
static unsigned long * l2_p2m_page;
static int l2_p2m_page_size; /* size of L2 page, in pages */

static void build_p2m_frame_list_list(void);
static void update_p2m_frame_list_list(void);

#endif

// #define PORT_DEBUG 4
// #define EARLY_DEBUG_EVENT

/* callback function type */
typedef void (*iterate_func_t)(unsigned int, unsigned int,
    unsigned int, void *);

static inline void
evt_iterate_bits(volatile unsigned long *pendingl1,
    volatile unsigned long *pendingl2,
    volatile unsigned long *mask,
    iterate_func_t iterate_pending, void *iterate_args)
{

        KASSERT(pendingl1 != NULL);
        KASSERT(pendingl2 != NULL);

        unsigned long l1, l2;
        unsigned int l1i, l2i, port;

        l1 = xen_atomic_xchg(pendingl1, 0);
        while ((l1i = xen_ffs(l1)) != 0) {
                l1i--;
                l1 &= ~(1UL << l1i);

                l2 = pendingl2[l1i] & (mask != NULL ? ~mask[l1i] : -1UL);
                l2 &= curcpu()->ci_evtmask[l1i];

                if (mask != NULL) xen_atomic_setbits_l(&mask[l1i], l2);
                xen_atomic_clearbits_l(&pendingl2[l1i], l2);

                while ((l2i = xen_ffs(l2)) != 0) {
                        l2i--;
                        l2 &= ~(1UL << l2i);

                        port = (l1i << LONG_SHIFT) + l2i;

                        iterate_pending(port, l1i, l2i, iterate_args);
                }
        }
}
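
/*
 * Layout note for evt_iterate_bits() above: each set bit l1i in the
 * level-1 selector word marks one word of the level-2 array as having
 * pending bits, and each level-2 bit l2i is a single event channel.
 * The global port number is rebuilt as (l1i << LONG_SHIFT) + l2i,
 * i.e. l1i * (bits per long) + l2i; for example, assuming 64-bit longs
 * (LONG_SHIFT == 6), port 70 decomposes into l1i == 1, l2i == 6.
 */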

/*
 * Set per-cpu "pending" information for outstanding events that
 * cannot be processed now.
 */

static inline void
evt_set_pending(unsigned int port, unsigned int l1i,
    unsigned int l2i, void *args)
{

        KASSERT(args != NULL);

        int *ret = args;

        if (evtsource[port]) {
                hypervisor_set_ipending(evtsource[port]->ev_imask, l1i, l2i);
                evtsource[port]->ev_evcnt.ev_count++;
                if (*ret == 0 && curcpu()->ci_ilevel <
                    evtsource[port]->ev_maxlevel)
                        *ret = 1;
        }
#ifdef DOM0OPS
        else {
                /* set pending event */
                xenevt_setipending(l1i, l2i);
        }
#endif
}

int stipending(void);
int
stipending(void)
{
        volatile shared_info_t *s = HYPERVISOR_shared_info;
        struct cpu_info *ci;
        volatile struct vcpu_info *vci;
        int ret;

        ret = 0;
        ci = curcpu();
        vci = ci->ci_vcpu;

#if 0
        if (HYPERVISOR_shared_info->events)
                printf("stipending events %08lx mask %08lx ilevel %d\n",
                    HYPERVISOR_shared_info->events,
                    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel);
#endif

#ifdef EARLY_DEBUG_EVENT
        if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) {
                xen_debug_handler(NULL);
                xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port);
        }
#endif

        /*
         * we're only called after STIC, so we know that we'll have to
         * STI at the end
         */

        while (vci->evtchn_upcall_pending) {
                x86_disable_intr();

                vci->evtchn_upcall_pending = 0;

                evt_iterate_bits(&vci->evtchn_pending_sel,
                    s->evtchn_pending, s->evtchn_mask,
                    evt_set_pending, &ret);

                x86_enable_intr();
        }

        return (ret);
}

/* Iterate through pending events and call the event handler */

static inline void
evt_do_hypervisor_callback(unsigned int port, unsigned int l1i,
    unsigned int l2i, void *args)
{
        KASSERT(args != NULL);

#ifdef DOM0OPS
        struct cpu_info *ci = curcpu();
#endif
        struct intrframe *regs = args;

#ifdef PORT_DEBUG
        if (port == PORT_DEBUG)
                printf("do_hypervisor_callback event %d\n", port);
#endif
        if (evtsource[port]) {
                KASSERT(cpu_intr_p());
                evtchn_do_event(port, regs);
        }
#ifdef DOM0OPS
        else {
                if (ci->ci_ilevel < IPL_HIGH) {
                        /* fast path */
                        int oipl = ci->ci_ilevel;
                        ci->ci_ilevel = IPL_HIGH;
                        KASSERT(cpu_intr_p());
                        xenevt_event(port);
                        ci->ci_ilevel = oipl;
                } else {
                        /* set pending event */
                        xenevt_setipending(l1i, l2i);
                }
        }
#endif
}
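
/*
 * Main entry point for Xen event delivery: drain the two-level pending
 * bitmap and dispatch every pending port through
 * evt_do_hypervisor_callback().  evtchn_upcall_pending is re-checked in
 * a loop in case the hypervisor posted new events while earlier ones
 * were being handled.
 */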

void
do_hypervisor_callback(struct intrframe *regs)
{
        volatile shared_info_t *s = HYPERVISOR_shared_info;
        struct cpu_info *ci;
        volatile struct vcpu_info *vci;
        int level __diagused;

        ci = curcpu();
        vci = ci->ci_vcpu;
        level = ci->ci_ilevel;

        /* Save trapframe for clock handler */
        KASSERT(regs != NULL);
        ci->ci_xen_clockf_usermode = USERMODE(regs->_INTRFRAME_CS);
        ci->ci_xen_clockf_pc = regs->_INTRFRAME_IP;

        // DDD printf("do_hypervisor_callback\n");

#ifdef EARLY_DEBUG_EVENT
        if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) {
                xen_debug_handler(NULL);
                xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port);
        }
#endif

        while (vci->evtchn_upcall_pending) {
                vci->evtchn_upcall_pending = 0;

                evt_iterate_bits(&vci->evtchn_pending_sel,
                    s->evtchn_pending, s->evtchn_mask,
                    evt_do_hypervisor_callback, regs);
        }

#ifdef DIAGNOSTIC
        if (level != ci->ci_ilevel)
                printf("hypervisor done %08x level %d/%d ipending %08x\n",
                    (uint)vci->evtchn_pending_sel,
                    level, ci->ci_ilevel, ci->ci_ipending);
#endif
}

#if 0
void
hypervisor_send_event(struct cpu_info *ci, unsigned int ev)
{
        KASSERT(ci != NULL);

        volatile shared_info_t *s = HYPERVISOR_shared_info;
        volatile struct vcpu_info *vci = ci->ci_vcpu;

#ifdef PORT_DEBUG
        if (ev == PORT_DEBUG)
                printf("hypervisor_send_event %d\n", ev);
#endif

        xen_atomic_set_bit(&s->evtchn_pending[0], ev);

        if (__predict_false(ci == curcpu())) {
                xen_atomic_set_bit(&vci->evtchn_pending_sel,
                    ev >> LONG_SHIFT);
                xen_atomic_set_bit(&vci->evtchn_upcall_pending, 0);
        }

        xen_atomic_clear_bit(&s->evtchn_mask[0], ev);

        if (__predict_true(ci == curcpu())) {
                hypervisor_force_callback();
        } else {
                if (__predict_false(xen_send_ipi(ci, XEN_IPI_HVCB))) {
                        panic("xen_send_ipi(cpu%d id %d, XEN_IPI_HVCB) failed\n",
                            (int) ci->ci_cpuid, ci->ci_vcpuid);
                }
        }
}
#endif

void
hypervisor_unmask_event(unsigned int ev)
{

        KASSERT(ev > 0 && ev < NR_EVENT_CHANNELS);

#ifdef PORT_DEBUG
        if (ev == PORT_DEBUG)
                printf("hypervisor_unmask_event %d\n", ev);
#endif

        /* Xen unmasks the evtchn_mask[0]:ev bit for us. */
        evtchn_op_t op;
        op.cmd = EVTCHNOP_unmask;
        op.u.unmask.port = ev;
        if (HYPERVISOR_event_channel_op(&op) != 0)
                panic("Failed to unmask event %d\n", ev);

        return;
}

void
hypervisor_mask_event(unsigned int ev)
{
        volatile shared_info_t *s = HYPERVISOR_shared_info;
#ifdef PORT_DEBUG
        if (ev == PORT_DEBUG)
                printf("hypervisor_mask_event %d\n", ev);
#endif

        xen_atomic_set_bit(&s->evtchn_mask[0], ev);
}

void
hypervisor_clear_event(unsigned int ev)
{
        volatile shared_info_t *s = HYPERVISOR_shared_info;
#ifdef PORT_DEBUG
        if (ev == PORT_DEBUG)
                printf("hypervisor_clear_event %d\n", ev);
#endif

        xen_atomic_clear_bit(&s->evtchn_pending[0], ev);
}

static inline void
evt_enable_event(unsigned int port, unsigned int l1i,
    unsigned int l2i, void *args)
{
        KASSERT(args == NULL);
        hypervisor_unmask_event(port);
}

void
hypervisor_enable_sir(unsigned int sir)
{
        struct cpu_info *ci = curcpu();

        /*
         * enable all events for ipl. As we only set an event in ipl_evt_mask
         * for its lowest IPL, and pending IPLs are processed high to low,
         * we know that all callbacks for this event have been processed.
         */

        evt_iterate_bits(&ci->ci_isources[sir]->ipl_evt_mask1,
            ci->ci_isources[sir]->ipl_evt_mask2, NULL,
            evt_enable_event, NULL);

}
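
/*
 * Mark the IPLs in imask as pending and remember which event channel
 * (l1, l2) caused it.  ipl_evt_mask1/ipl_evt_mask2 form a per-IPL
 * two-level bitmap with the same layout as Xen's
 * evtchn_pending_sel/evtchn_pending; hypervisor_enable_sir() above
 * walks those bits and unmasks each recorded event channel.
 */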

void
hypervisor_set_ipending(uint32_t imask, int l1, int l2)
{

        /* This function is not re-entrant */
        KASSERT(x86_read_psl() != 0);

        int sir;
        struct cpu_info *ci = curcpu();

        /* set pending bit for the appropriate IPLs */
        ci->ci_ipending |= imask;

        /*
         * And set event pending bit for the lowest IPL. As IPLs are handled
         * from high to low, this ensures that all callbacks will have been
         * called when we ack the event
         */
        sir = ffs(imask);
        KASSERT(sir > SIR_XENIPL_VM);
        sir--;
        KASSERT(sir <= SIR_XENIPL_HIGH);
        KASSERT(ci->ci_isources[sir] != NULL);
        ci->ci_isources[sir]->ipl_evt_mask1 |= 1UL << l1;
        ci->ci_isources[sir]->ipl_evt_mask2[l1] |= 1UL << l2;
        KASSERT(ci == curcpu());
#if 0
        if (__predict_false(ci != curcpu())) {
                if (xen_send_ipi(ci, XEN_IPI_HVCB)) {
                        panic("hypervisor_set_ipending: "
                            "xen_send_ipi(cpu%d id %d, XEN_IPI_HVCB) failed\n",
                            (int) ci->ci_cpuid, ci->ci_vcpuid);
                }
        }
#endif
}

void
hypervisor_machdep_attach(void)
{
#ifdef XENPV
        /* dom0 does not require the arch-dependent P2M translation table */
        if (!xendomain_is_dom0()) {
                build_p2m_frame_list_list();
                sysctl_xen_suspend_setup();
        }
#endif
}

void
hypervisor_machdep_resume(void)
{
#ifdef XENPV
        /* dom0 does not require the arch-dependent P2M translation table */
        if (!xendomain_is_dom0())
                update_p2m_frame_list_list();
#endif
}

/*
 * idle_block()
 *
 *      Called from the idle loop when we have nothing to do but wait
 *      for an interrupt.
 */
static void
idle_block(void)
{
        KASSERT(curcpu()->ci_ipending == 0);
        HYPERVISOR_block();
        KASSERT(curcpu()->ci_ipending == 0);
}

void
x86_cpu_idle_xen(void)
{
        struct cpu_info *ci = curcpu();

        KASSERT(ci->ci_ilevel == IPL_NONE);

        x86_disable_intr();
        if (__predict_false(!ci->ci_want_resched)) {
                idle_block();
        } else {
                x86_enable_intr();
        }
}

#ifdef XENPV
/*
 * Generate the p2m_frame_list_list table,
 * needed for guest save/restore
 */
static void
build_p2m_frame_list_list(void)
{
        int fpp; /* number of page (frame) pointers per page */
        unsigned long max_pfn;
        /*
         * The p2m list is composed of three levels of indirection,
         * each layer containing MFNs pointing to lower level pages.
         * The indirection is used to convert a given PFN to its MFN.
         * Each N level page can point to @fpp (N-1) level pages.
         * For example, for x86 32bit, we have:
         * - PAGE_SIZE: 4096 bytes
         * - fpp: 1024 (one L3 page can address 1024 L2 pages)
         * A L1 page contains the list of MFNs we are looking for.
         */
        max_pfn = xen_start_info.nr_pages;
        fpp = PAGE_SIZE / sizeof(xen_pfn_t);

        /* we only need one L3 page */
        l3_p2m_page = (vaddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE,
            PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_NOWAIT);
        if (l3_p2m_page == NULL)
                panic("could not allocate memory for l3_p2m_page");

        /*
         * Determine how many L2 pages we need for the mapping
         * Each L2 page can map a total of @fpp L1 pages
         */
        l2_p2m_page_size = howmany(max_pfn, fpp);

        l2_p2m_page = (vaddr_t *)uvm_km_alloc(kernel_map,
            l2_p2m_page_size * PAGE_SIZE,
            PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_NOWAIT);
        if (l2_p2m_page == NULL)
                panic("could not allocate memory for l2_p2m_page");

        /* We now have L3 and L2 pages ready, update L1 mapping */
        update_p2m_frame_list_list();

}
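
/*
 * Illustrative sizing, assuming 4 KB pages and an 8-byte xen_pfn_t
 * (e.g. amd64): fpp = 4096 / 8 = 512, so each page of the list holds
 * 512 MFN entries.  A guest with nr_pages = 262144 (1 GB of 4 KB pages)
 * then needs howmany(262144, 512) = 512 L1 frames referenced from the
 * L2 level, all of which fit under the single L3 page.
 */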

/*
 * Update the L1 p2m_frame_list_list mapping (during guest boot or resume)
 */
static void
update_p2m_frame_list_list(void)
{
        int i;
        int fpp; /* number of page (frame) pointers per page */
        unsigned long max_pfn;

        max_pfn = xen_start_info.nr_pages;
        fpp = PAGE_SIZE / sizeof(xen_pfn_t);

        for (i = 0; i < l2_p2m_page_size; i++) {
                /*
                 * Each time we start a new L2 page,
                 * store its MFN in the L3 page
                 */
                if ((i % fpp) == 0) {
                        l3_p2m_page[i/fpp] = vtomfn(
                                (vaddr_t)&l2_p2m_page[i]);
                }
                /*
                 * we use a shortcut:
                 * since the @xpmap_phys_to_machine_mapping array
                 * already contains the PFN to MFN mapping, we just
                 * set the l2_p2m_page MFN pointer to the MFN of the
                 * corresponding frame of @xpmap_phys_to_machine_mapping
                 */
                l2_p2m_page[i] = vtomfn((vaddr_t)
                        &xpmap_phys_to_machine_mapping[i*fpp]);
        }

        HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
            vtomfn((vaddr_t)l3_p2m_page);
        HYPERVISOR_shared_info->arch.max_pfn = max_pfn;

}
#endif /* XENPV */

void
xen_init_ksyms(void)
{
#if NKSYMS || defined(DDB) || defined(MODULAR)
        extern int end;
        extern int *esym;
#ifdef DDB
        db_machine_init();
#endif

#ifdef XENPV
        esym = xen_start_info.mod_start ?
            (void *)xen_start_info.mod_start :
            (void *)xen_start_info.mfn_list;
#endif /* XENPV */
        /* for PVH, esym is set in locore.S */
        ksyms_addsyms_elf(*(int *)(void *)&end,
            ((int *)(void *)&end) + 1, esym);
#endif
}