/*	$NetBSD: hypervisor_machdep.c,v 1.12 2009/07/29 12:02:08 cegger Exp $	*/

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Christian Limpach.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/******************************************************************************
 * hypervisor.c
 *
 * Communication to/from hypervisor.
 *
 * Copyright (c) 2002-2004, K A Fraser
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */


#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.12 2009/07/29 12:02:08 cegger Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>

#include <uvm/uvm_extern.h>

#include <machine/vmparam.h>
#include <machine/pmap.h>

#include <xen/xen.h>
#include <xen/hypervisor.h>
#include <xen/evtchn.h>
#include <xen/xenpmap.h>

#include "opt_xen.h"

/*
 * arch-dependent p2m frame lists list (L3 and L2)
 * used by Xen for save/restore mappings
 */
static unsigned long * l3_p2m_page;
static unsigned long * l2_p2m_page;
static int l2_p2m_page_size; /* size of L2 page, in pages */

static void build_p2m_frame_list_list(void);
static void update_p2m_frame_list_list(void);

// #define PORT_DEBUG 4
// #define EARLY_DEBUG_EVENT
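
/*
 * Layout of the event-channel bitmaps scanned by stipending() and
 * do_hypervisor_callback() below (illustrative summary): the hypervisor
 * publishes pending events as a two-level bitmap in the shared info page.
 * Each bit of vcpu_info.evtchn_pending_sel selects one word of
 * shared_info.evtchn_pending[], and each bit of that word is one event
 * channel, masked by the matching bit of evtchn_mask[].  A port number
 * therefore decomposes as port = (l1i << LONG_SHIFT) + l2i; for example,
 * with 32-bit longs (LONG_SHIFT == 5), port 38 lives in word 1, bit 6.
 */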

int stipending(void);
int
stipending(void)
{
	unsigned long l1;
	unsigned long l2;
	unsigned int l1i, l2i, port;
	volatile shared_info_t *s = HYPERVISOR_shared_info;
	struct cpu_info *ci;
	volatile struct vcpu_info *vci;
	int ret;

	ret = 0;
	ci = curcpu();
	vci = ci->ci_vcpu;

#if 0
	if (HYPERVISOR_shared_info->events)
		printf("stipending events %08lx mask %08lx ilevel %d\n",
		    HYPERVISOR_shared_info->events,
		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel);
#endif

#ifdef EARLY_DEBUG_EVENT
	if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) {
		xen_debug_handler(NULL);
		xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port);
	}
#endif

	/*
	 * we're only called after STIC, so we know that we'll have to
	 * STI at the end
	 */
	while (vci->evtchn_upcall_pending) {
		cli();
		vci->evtchn_upcall_pending = 0;
		/* NB. No need for a barrier here -- XCHG is a barrier
		 * on x86. */
		l1 = xen_atomic_xchg(&vci->evtchn_pending_sel, 0);
		while ((l1i = xen_ffs(l1)) != 0) {
			l1i--;
			l1 &= ~(1UL << l1i);

			l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
			/*
			 * mask and clear the events. More efficient than
			 * calling hypervisor_mask/clear_event for each event.
			 */
			xen_atomic_setbits_l(&s->evtchn_mask[l1i], l2);
			xen_atomic_clearbits_l(&s->evtchn_pending[l1i], l2);
			while ((l2i = xen_ffs(l2)) != 0) {
				l2i--;
				l2 &= ~(1UL << l2i);

				port = (l1i << LONG_SHIFT) + l2i;
				if (evtsource[port]) {
					hypervisor_set_ipending(
					    evtsource[port]->ev_imask,
					    l1i, l2i);
					evtsource[port]->ev_evcnt.ev_count++;
					if (ret == 0 && ci->ci_ilevel <
					    evtsource[port]->ev_maxlevel)
						ret = 1;
				}
#ifdef DOM0OPS
				else {
					/* set pending event */
					xenevt_setipending(l1i, l2i);
				}
#endif
			}
		}
		sti();
	}

#if 0
	if (ci->ci_ipending & 0x1)
		printf("stipending events %08lx mask %08lx ilevel %d ipending %08x\n",
		    HYPERVISOR_shared_info->events,
		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel,
		    ci->ci_ipending);
#endif

	return (ret);
}

void
do_hypervisor_callback(struct intrframe *regs)
{
	unsigned long l1;
	unsigned long l2;
	unsigned int l1i, l2i, port;
	volatile shared_info_t *s = HYPERVISOR_shared_info;
	struct cpu_info *ci;
	volatile struct vcpu_info *vci;
	int level;

	ci = curcpu();
	vci = ci->ci_vcpu;
	level = ci->ci_ilevel;

	// DDD printf("do_hypervisor_callback\n");

#ifdef EARLY_DEBUG_EVENT
	if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) {
		xen_debug_handler(NULL);
		xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port);
	}
#endif

	while (vci->evtchn_upcall_pending) {
		vci->evtchn_upcall_pending = 0;
		/* NB. No need for a barrier here -- XCHG is a barrier
		 * on x86. */
		l1 = xen_atomic_xchg(&vci->evtchn_pending_sel, 0);
		while ((l1i = xen_ffs(l1)) != 0) {
			l1i--;
			l1 &= ~(1UL << l1i);

			l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
			/*
			 * mask and clear the pending events.
			 * Doing it here for all events that will be processed
			 * avoids a race with stipending (which can be called
			 * through evtchn_do_event->splx) that could cause an
			 * event to be both processed and marked pending.
			 */
			xen_atomic_setbits_l(&s->evtchn_mask[l1i], l2);
			xen_atomic_clearbits_l(&s->evtchn_pending[l1i], l2);

			while ((l2i = xen_ffs(l2)) != 0) {
				l2i--;
				l2 &= ~(1UL << l2i);

				port = (l1i << LONG_SHIFT) + l2i;
#ifdef PORT_DEBUG
				if (port == PORT_DEBUG)
					printf("do_hypervisor_callback event %d\n", port);
#endif
				if (evtsource[port])
					call_evtchn_do_event(port, regs);
#ifdef DOM0OPS
				else {
					if (ci->ci_ilevel < IPL_HIGH) {
						/* fast path */
						int oipl = ci->ci_ilevel;
						ci->ci_ilevel = IPL_HIGH;
						call_xenevt_event(port);
						ci->ci_ilevel = oipl;
					} else {
						/* set pending event */
						xenevt_setipending(l1i, l2i);
					}
				}
#endif
			}
		}
	}

#ifdef DIAGNOSTIC
	if (level != ci->ci_ilevel)
		printf("hypervisor done %08x level %d/%d ipending %08x\n",
		    (uint)vci->evtchn_pending_sel,
		    level, ci->ci_ilevel, ci->ci_ipending);
#endif
}

void
hypervisor_unmask_event(unsigned int ev)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
	volatile struct vcpu_info *vci = curcpu()->ci_vcpu;

#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_unmask_event %d\n", ev);
#endif

	xen_atomic_clear_bit(&s->evtchn_mask[0], ev);
	/*
	 * The following is basically the equivalent of
	 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the
	 * interrupt edge' if the channel is masked.
	 */
	if (xen_atomic_test_bit(&s->evtchn_pending[0], ev) &&
	    !xen_atomic_test_and_set_bit(&vci->evtchn_pending_sel, ev>>LONG_SHIFT)) {
		xen_atomic_set_bit(&vci->evtchn_upcall_pending, 0);
		if (!vci->evtchn_upcall_mask)
			hypervisor_force_callback();
	}
}

void
hypervisor_mask_event(unsigned int ev)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_mask_event %d\n", ev);
#endif

	xen_atomic_set_bit(&s->evtchn_mask[0], ev);
}

void
hypervisor_clear_event(unsigned int ev)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_clear_event %d\n", ev);
#endif

	xen_atomic_clear_bit(&s->evtchn_pending[0], ev);
}
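
/*
 * The two routines below maintain, per IPL, a two-level bitmap of event
 * channels whose handling was deferred (illustrative summary):
 * hypervisor_set_ipending() records an event under the lowest IPL of its
 * handlers by setting bit l1 of ipl_evt_mask1 and bit l2 of
 * ipl_evt_mask2[l1]; hypervisor_enable_ipl() walks those bits and calls
 * hypervisor_enable_event() for each recorded channel once the IPL is
 * enabled again.  With 32-bit longs, recording (l1 = 1, l2 = 6) stands
 * for event channel 38.
 */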

void
hypervisor_enable_ipl(unsigned int ipl)
{
	u_long l1, l2;
	int l1i, l2i;
	struct cpu_info *ci = curcpu();

	/*
	 * enable all events for ipl. As we only set an event in ipl_evt_mask
	 * for its lowest IPL, and pending IPLs are processed high to low,
	 * we know that all callbacks for this event have been processed.
	 */

	l1 = ci->ci_isources[ipl]->ipl_evt_mask1;
	ci->ci_isources[ipl]->ipl_evt_mask1 = 0;
	while ((l1i = xen_ffs(l1)) != 0) {
		l1i--;
		l1 &= ~(1UL << l1i);
		l2 = ci->ci_isources[ipl]->ipl_evt_mask2[l1i];
		ci->ci_isources[ipl]->ipl_evt_mask2[l1i] = 0;
		while ((l2i = xen_ffs(l2)) != 0) {
			int evtch;

			l2i--;
			l2 &= ~(1UL << l2i);

			evtch = (l1i << LONG_SHIFT) + l2i;
			hypervisor_enable_event(evtch);
		}
	}
}

void
hypervisor_set_ipending(uint32_t iplmask, int l1, int l2)
{
	int ipl;
	struct cpu_info *ci = curcpu();

	/* set pending bit for the appropriate IPLs */
	ci->ci_ipending |= iplmask;

	/*
	 * And set the event pending bit for the lowest IPL. As IPLs are
	 * handled from high to low, this ensures that all callbacks will
	 * have been called when we ack the event.
	 */
	ipl = ffs(iplmask);
	KASSERT(ipl > 0);
	ipl--;
	ci->ci_isources[ipl]->ipl_evt_mask1 |= 1UL << l1;
	ci->ci_isources[ipl]->ipl_evt_mask2[l1] |= 1UL << l2;
}

void
hypervisor_machdep_attach(void)
{
	/* dom0 does not require the arch-dependent P2M translation table */
	if (!xendomain_is_dom0()) {
		build_p2m_frame_list_list();
	}
}
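
/*
 * Worked example for the tables built below (figures are illustrative
 * and assume 4 KB pages with 4-byte p2m entries, i.e. fpp == 1024):
 * a guest with 131072 pages (512 MB) needs 128 frames of
 * xpmap_phys_to_machine_mapping to hold its PFN->MFN (L1) entries.
 * l2_p2m_page records the MFN of each of those L1 frames, l3_p2m_page in
 * turn records the MFN of each frame of l2_p2m_page, and
 * arch.pfn_to_mfn_frame_list_list finally points at l3_p2m_page.
 */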

/*
 * Generate the p2m_frame_list_list table,
 * needed for guest save/restore
 */
static void
build_p2m_frame_list_list(void)
{
	int fpp; /* number of page (frame) pointers per page */
	unsigned long max_pfn;
	/*
	 * The p2m list is composed of three levels of indirection,
	 * each layer containing MFNs pointing to lower level pages.
	 * The indirection is used to convert a given PFN to its MFN.
	 * Each N level page can point to @fpp (N-1) level pages.
	 * For example, for x86 32bit, we have:
	 * - PAGE_SIZE: 4096 bytes
	 * - fpp: 1024 (one L3 page can address 1024 L2 pages)
	 * An L1 page contains the list of MFNs we are looking for.
	 */
	max_pfn = xen_start_info.nr_pages;
	fpp = PAGE_SIZE / sizeof(paddr_t);

	/* we only need one L3 page */
	l3_p2m_page = kmem_alloc(PAGE_SIZE, KM_NOSLEEP);
	if (l3_p2m_page == NULL)
		panic("could not allocate memory for l3_p2m_page");

	/*
	 * Determine how many L2 pages we need for the mapping.
	 * Each L2 can map a total of @fpp L1 pages.
	 */
	l2_p2m_page_size = howmany(max_pfn, fpp);

	l2_p2m_page = kmem_alloc(l2_p2m_page_size * PAGE_SIZE, KM_NOSLEEP);
	if (l2_p2m_page == NULL)
		panic("could not allocate memory for l2_p2m_page");

	/* We now have L3 and L2 pages ready, update L1 mapping */
	update_p2m_frame_list_list();
}

/*
 * Update the L1 p2m_frame_list_list mapping (during guest boot or resume)
 */
static void
update_p2m_frame_list_list(void)
{
	int i;
	int fpp; /* number of page (frame) pointers per page */
	unsigned long max_pfn;

	max_pfn = xen_start_info.nr_pages;
	fpp = PAGE_SIZE / sizeof(paddr_t);

	for (i = 0; i < l2_p2m_page_size; i++) {
		/*
		 * Each time we start a new L2 page,
		 * store its MFN in the L3 page
		 */
		if ((i % fpp) == 0) {
			l3_p2m_page[i/fpp] = vtomfn(
				(vaddr_t)&l2_p2m_page[i]);
		}
		/*
		 * We use a shortcut: since the
		 * @xpmap_phys_to_machine_mapping array
		 * already contains the PFN to MFN mapping, we just
		 * set the l2_p2m_page MFN pointer to the MFN of the
		 * corresponding frame of @xpmap_phys_to_machine_mapping.
		 */
		l2_p2m_page[i] = vtomfn((vaddr_t)
			&xpmap_phys_to_machine_mapping[i*fpp]);
	}

	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
	    vtomfn((vaddr_t)l3_p2m_page);
	HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
}