15084Sjohnlev /* 25084Sjohnlev * CDDL HEADER START 35084Sjohnlev * 45084Sjohnlev * The contents of this file are subject to the terms of the 55084Sjohnlev * Common Development and Distribution License (the "License"). 65084Sjohnlev * You may not use this file except in compliance with the License. 75084Sjohnlev * 85084Sjohnlev * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 95084Sjohnlev * or http://www.opensolaris.org/os/licensing. 105084Sjohnlev * See the License for the specific language governing permissions 115084Sjohnlev * and limitations under the License. 125084Sjohnlev * 135084Sjohnlev * When distributing Covered Code, include this CDDL HEADER in each 145084Sjohnlev * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 155084Sjohnlev * If applicable, add the following below this CDDL HEADER, with the 165084Sjohnlev * fields enclosed by brackets "[]" replaced with your own identifying 175084Sjohnlev * information: Portions Copyright [yyyy] [name of copyright owner] 185084Sjohnlev * 195084Sjohnlev * CDDL HEADER END 205084Sjohnlev */ 215084Sjohnlev 225084Sjohnlev /* 23*8679SSeth.Goldberg@Sun.COM * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 245084Sjohnlev * Use is subject to license terms. 255084Sjohnlev */ 265084Sjohnlev 275084Sjohnlev #include <sys/types.h> 285084Sjohnlev #include <sys/clock.h> 295084Sjohnlev #include <sys/psm.h> 305084Sjohnlev #include <sys/archsystm.h> 315084Sjohnlev #include <sys/machsystm.h> 325084Sjohnlev #include <sys/compress.h> 335084Sjohnlev #include <sys/modctl.h> 345084Sjohnlev #include <sys/trap.h> 355084Sjohnlev #include <sys/panic.h> 365084Sjohnlev #include <sys/regset.h> 375084Sjohnlev #include <sys/frame.h> 385084Sjohnlev #include <sys/kobj.h> 395084Sjohnlev #include <sys/apic.h> 405084Sjohnlev #include <sys/dumphdr.h> 415084Sjohnlev #include <sys/mem.h> 425084Sjohnlev #include <sys/x86_archext.h> 435084Sjohnlev #include <sys/xpv_panic.h> 445084Sjohnlev #include <sys/boot_console.h> 455084Sjohnlev #include <sys/bootsvcs.h> 465084Sjohnlev #include <sys/consdev.h> 475084Sjohnlev #include <vm/hat_pte.h> 485084Sjohnlev #include <vm/hat_i86.h> 495084Sjohnlev 505084Sjohnlev /* XXX: need to add a PAE version too, if we ever support both PAE and non */ 515084Sjohnlev #if defined(__i386) 525084Sjohnlev #define XPV_FILENAME "/boot/xen-syms" 535084Sjohnlev #else 545084Sjohnlev #define XPV_FILENAME "/boot/amd64/xen-syms" 555084Sjohnlev #endif 565084Sjohnlev #define XPV_MODNAME "xpv" 575084Sjohnlev 585084Sjohnlev int xpv_panicking = 0; 595084Sjohnlev 605084Sjohnlev struct module *xpv_module; 615084Sjohnlev struct modctl *xpv_modctl; 625084Sjohnlev 635084Sjohnlev #define ALIGN(x, a) ((a) == 0 ? (uintptr_t)(x) : \ 645084Sjohnlev (((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l))) 655084Sjohnlev 665084Sjohnlev /* Pointer to the xpv_panic_info structure handed to us by Xen. */ 675084Sjohnlev static struct panic_info *xpv_panic_info = NULL; 685084Sjohnlev 695084Sjohnlev /* Timer support */ 705084Sjohnlev #define NSEC_SHIFT 5 715084Sjohnlev #define T_XPV_TIMER 0xd1 725084Sjohnlev #define XPV_TIMER_INTERVAL 1000 /* 1000 microseconds */ 735084Sjohnlev static uint32_t *xpv_apicadr = NULL; 745084Sjohnlev static uint_t nsec_scale; 755084Sjohnlev 765084Sjohnlev /* IDT support */ 775084Sjohnlev #pragma align 16(xpv_panic_idt) 785084Sjohnlev static gate_desc_t xpv_panic_idt[NIDT]; /* interrupt descriptor table */ 795084Sjohnlev 805084Sjohnlev /* Xen pagetables mapped into our HAT's ptable windows */ 815084Sjohnlev static pfn_t ptable_pfn[MAX_NUM_LEVEL]; 825084Sjohnlev 835084Sjohnlev /* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */ 845084Sjohnlev static int xpv_dump_pages; 855084Sjohnlev 865084Sjohnlev /* 876144Srab * There are up to two large swathes of RAM that we don't want to include 886144Srab * in the dump: those that comprise the Xen version of segkpm. On 32-bit 896144Srab * systems there is no such region of memory. On 64-bit systems, there 906144Srab * should be just a single contiguous region that corresponds to all of 916144Srab * physical memory. The tricky bit is that Xen's heap sometimes lives in 926144Srab * the middle of their segkpm, and is mapped using only kpm-like addresses. 936144Srab * In that case, we need to skip the swathes before and after Xen's heap. 946144Srab */ 956144Srab uintptr_t kpm1_low = 0; 966144Srab uintptr_t kpm1_high = 0; 976144Srab uintptr_t kpm2_low = 0; 986144Srab uintptr_t kpm2_high = 0; 996144Srab 1006144Srab /* 1015084Sjohnlev * Some commonly used values that we don't want to recompute over and over. 1025084Sjohnlev */ 1035084Sjohnlev static int xpv_panic_nptes[MAX_NUM_LEVEL]; 1045084Sjohnlev static ulong_t xpv_panic_cr3; 1055084Sjohnlev static uintptr_t xpv_end; 1065084Sjohnlev 1075084Sjohnlev static void xpv_panic_console_print(const char *fmt, ...); 1085084Sjohnlev static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print; 1095084Sjohnlev 1105084Sjohnlev #define CONSOLE_BUF_SIZE 256 1115084Sjohnlev static char console_buffer[CONSOLE_BUF_SIZE]; 1125084Sjohnlev static boolean_t use_polledio; 1135084Sjohnlev 1147532SSean.Ye@Sun.COM /* 1157532SSean.Ye@Sun.COM * Pointers to machine check panic info (if any). 1167532SSean.Ye@Sun.COM */ 1177532SSean.Ye@Sun.COM xpv_mca_panic_data_t *xpv_mca_panic_data = NULL; 1187532SSean.Ye@Sun.COM 1195084Sjohnlev static void 1205084Sjohnlev xpv_panic_putc(int m) 1215084Sjohnlev { 1225084Sjohnlev struct cons_polledio *c = cons_polledio; 1235084Sjohnlev 1245084Sjohnlev /* This really shouldn't happen */ 1255084Sjohnlev if (console == CONS_HYPERVISOR) 1265084Sjohnlev return; 1275084Sjohnlev 1285084Sjohnlev if (use_polledio == B_TRUE) 1295084Sjohnlev c->cons_polledio_putchar(c->cons_polledio_argument, m); 1305084Sjohnlev else 1315084Sjohnlev bcons_putchar(m); 1325084Sjohnlev } 1335084Sjohnlev 1345084Sjohnlev static void 1355084Sjohnlev xpv_panic_puts(char *msg) 1365084Sjohnlev { 1375084Sjohnlev char *m; 1385084Sjohnlev 1395084Sjohnlev dump_timeleft = dump_timeout; 1405084Sjohnlev for (m = msg; *m; m++) 1415084Sjohnlev xpv_panic_putc((int)*m); 1425084Sjohnlev } 1435084Sjohnlev 1445084Sjohnlev static void 1455084Sjohnlev xpv_panic_console_print(const char *fmt, ...) 1465084Sjohnlev { 1475084Sjohnlev va_list ap; 1485084Sjohnlev 1495084Sjohnlev va_start(ap, fmt); 1505084Sjohnlev (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap); 1515084Sjohnlev va_end(ap); 1525084Sjohnlev 1535084Sjohnlev xpv_panic_puts(console_buffer); 1545084Sjohnlev } 1555084Sjohnlev 1565084Sjohnlev static void 1575084Sjohnlev xpv_panic_map(int level, pfn_t pfn) 1585084Sjohnlev { 1595084Sjohnlev x86pte_t pte, *pteptr; 1605084Sjohnlev 1615084Sjohnlev /* 1625084Sjohnlev * The provided pfn represents a level 'level' page table. Map it 1635084Sjohnlev * into the 'level' slot in the list of page table windows. 1645084Sjohnlev */ 1655084Sjohnlev pteptr = (x86pte_t *)PWIN_PTE_VA(level); 1665084Sjohnlev pte = pfn_to_pa(pfn) | PT_VALID; 1675084Sjohnlev 1685084Sjohnlev XPV_ALLOW_PAGETABLE_UPDATES(); 1695084Sjohnlev if (mmu.pae_hat) 1705084Sjohnlev *pteptr = pte; 1715084Sjohnlev else 1725084Sjohnlev *(x86pte32_t *)pteptr = pte; 1735084Sjohnlev XPV_DISALLOW_PAGETABLE_UPDATES(); 1745084Sjohnlev 1755084Sjohnlev mmu_tlbflush_entry(PWIN_VA(level)); 1765084Sjohnlev } 1775084Sjohnlev 1785084Sjohnlev /* 1795084Sjohnlev * Walk the page tables to find the pfn mapped by the given va. 1805084Sjohnlev */ 1815084Sjohnlev static pfn_t 1825084Sjohnlev xpv_va_walk(uintptr_t *vaddr) 1835084Sjohnlev { 1845084Sjohnlev int l, idx; 1855084Sjohnlev pfn_t pfn; 1865084Sjohnlev x86pte_t pte; 1875084Sjohnlev x86pte_t *ptep; 1885084Sjohnlev uintptr_t va = *vaddr; 1895084Sjohnlev uintptr_t scan_va; 1905084Sjohnlev caddr_t ptable_window; 1915084Sjohnlev static pfn_t toplevel_pfn; 1925084Sjohnlev static uintptr_t lastva; 1935084Sjohnlev 1945084Sjohnlev /* 1955084Sjohnlev * If we do anything other than a simple scan through memory, don't 1965084Sjohnlev * trust the mapped page tables. 1975084Sjohnlev */ 1985084Sjohnlev if (va != lastva + MMU_PAGESIZE) 1995084Sjohnlev for (l = mmu.max_level; l >= 0; l--) 2005084Sjohnlev ptable_pfn[l] = PFN_INVALID; 2015084Sjohnlev 2025084Sjohnlev toplevel_pfn = mmu_btop(xpv_panic_cr3); 2035084Sjohnlev 2045084Sjohnlev while (va < xpv_end && va >= *vaddr) { 2055084Sjohnlev /* Find the lowest table with any entry for va */ 2065084Sjohnlev pfn = toplevel_pfn; 2075084Sjohnlev for (l = mmu.max_level; l >= 0; l--) { 2085084Sjohnlev if (ptable_pfn[l] != pfn) { 2095084Sjohnlev xpv_panic_map(l, pfn); 2105084Sjohnlev ptable_pfn[l] = pfn; 2115084Sjohnlev } 2125084Sjohnlev 2135084Sjohnlev /* 2145084Sjohnlev * Search this pagetable for any mapping to an 2155084Sjohnlev * address >= va. 2165084Sjohnlev */ 2175084Sjohnlev ptable_window = PWIN_VA(l); 2185084Sjohnlev if (l == mmu.max_level && mmu.pae_hat) 2195084Sjohnlev ptable_window += 2205084Sjohnlev (xpv_panic_cr3 & MMU_PAGEOFFSET); 2215084Sjohnlev 2225084Sjohnlev idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1); 2235084Sjohnlev scan_va = va; 2245084Sjohnlev while (idx < xpv_panic_nptes[l] && scan_va < xpv_end && 2255084Sjohnlev scan_va >= *vaddr) { 2265084Sjohnlev ptep = (x86pte_t *)(ptable_window + 2275084Sjohnlev (idx << mmu.pte_size_shift)); 2285084Sjohnlev pte = GET_PTE(ptep); 2295084Sjohnlev if (pte & PTE_VALID) 2305084Sjohnlev break; 2315084Sjohnlev idx++; 2325084Sjohnlev scan_va += mmu.level_size[l]; 2335084Sjohnlev } 2345084Sjohnlev 2355084Sjohnlev /* 2365084Sjohnlev * If there are no valid mappings in this table, we 2375084Sjohnlev * can skip to the end of the VA range it covers. 2385084Sjohnlev */ 2395084Sjohnlev if (idx == xpv_panic_nptes[l]) { 2405084Sjohnlev va = NEXT_ENTRY_VA(va, l + 1); 2415084Sjohnlev break; 2425084Sjohnlev } 2435084Sjohnlev 2446144Srab va = scan_va; 2456144Srab /* 2466144Srab * See if we've hit the end of the range. 2476144Srab */ 2486144Srab if (va >= xpv_end || va < *vaddr) 2496144Srab break; 2506144Srab 2515084Sjohnlev /* 2525084Sjohnlev * If this mapping is for a pagetable, we drop down 2535084Sjohnlev * to the next level in the hierarchy and look for 2545084Sjohnlev * a mapping in it. 2555084Sjohnlev */ 2565084Sjohnlev pfn = PTE2MFN(pte, l); 2575084Sjohnlev if (!PTE_ISPAGE(pte, l)) 2585084Sjohnlev continue; 2595084Sjohnlev 2605084Sjohnlev /* 2615084Sjohnlev * The APIC page is magic. Nothing to see here; 2625084Sjohnlev * move along. 2635084Sjohnlev */ 2645084Sjohnlev if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) == 2655084Sjohnlev (va & MMU_PAGEMASK)) { 2665084Sjohnlev va += MMU_PAGESIZE; 2675084Sjohnlev break; 2685084Sjohnlev } 2695084Sjohnlev 2706144Srab /* 2716144Srab * See if the address is within one of the two 2726144Srab * kpm-like regions we want to skip. 2736144Srab */ 2746144Srab if (va >= kpm1_low && va < kpm1_high) { 2756144Srab va = kpm1_high; 2766144Srab break; 2776144Srab } 2786144Srab if (va >= kpm2_low && va < kpm2_high) { 2796144Srab va = kpm2_high; 2805084Sjohnlev break; 2815084Sjohnlev } 2825084Sjohnlev 2835084Sjohnlev /* 2845084Sjohnlev * The Xen panic code only handles small pages. If 2855084Sjohnlev * this mapping is for a large page, we need to 2865084Sjohnlev * identify the consituent page that covers the 2875084Sjohnlev * specific VA we were looking for. 2885084Sjohnlev */ 2895084Sjohnlev if (l > 0) { 2905084Sjohnlev if (l > 1) 2915084Sjohnlev panic("Xen panic can't cope with " 2925084Sjohnlev "giant pages."); 2935084Sjohnlev idx = (va >> LEVEL_SHIFT(0)) & 2945084Sjohnlev (xpv_panic_nptes[0] - 1); 2955084Sjohnlev pfn += idx; 2965084Sjohnlev } 2975084Sjohnlev 2985084Sjohnlev *vaddr = va; 2995084Sjohnlev lastva = va; 3005084Sjohnlev return (pfn | PFN_IS_FOREIGN_MFN); 3015084Sjohnlev } 3025084Sjohnlev } 3035084Sjohnlev return (PFN_INVALID); 3045084Sjohnlev } 3055084Sjohnlev 3065084Sjohnlev /* 3075084Sjohnlev * Walk through the Xen VA space, finding pages that are mapped in. 3085084Sjohnlev * 3095084Sjohnlev * These pages all have MFNs rather than PFNs, meaning they may be outside 3105084Sjohnlev * the physical address space the kernel knows about, or they may collide 3115084Sjohnlev * with PFNs the kernel is using. 3125084Sjohnlev * 3135084Sjohnlev * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs 3145084Sjohnlev * to avoid collisions doesn't work. The pages need to be written to disk 3155084Sjohnlev * in PFN-order or savecore gets confused. We can't allocate memory to 3165084Sjohnlev * contruct a sorted pfn->VA reverse mapping, so we have to write the pages 3175084Sjohnlev * to disk in VA order. 3185084Sjohnlev * 3195084Sjohnlev * To square this circle, we simply make up PFNs for each of Xen's pages. 3205084Sjohnlev * We assign each mapped page a fake PFN in ascending order. These fake 3215084Sjohnlev * PFNs each have the FOREIGN bit set, ensuring that they fall outside the 3225084Sjohnlev * range of Solaris PFNs written by the kernel. 3235084Sjohnlev */ 3245084Sjohnlev int 3255084Sjohnlev dump_xpv_addr() 3265084Sjohnlev { 3275084Sjohnlev uintptr_t va; 3285084Sjohnlev mem_vtop_t mem_vtop; 3295084Sjohnlev 3305084Sjohnlev xpv_dump_pages = 0; 3315084Sjohnlev va = xen_virt_start; 3325084Sjohnlev 3335084Sjohnlev while (xpv_va_walk(&va) != PFN_INVALID) { 3345084Sjohnlev mem_vtop.m_as = &kas; 3355084Sjohnlev mem_vtop.m_va = (void *)va; 3365084Sjohnlev mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 3375084Sjohnlev 3385084Sjohnlev dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 3395084Sjohnlev xpv_dump_pages++; 3405084Sjohnlev 3415084Sjohnlev va += MMU_PAGESIZE; 3425084Sjohnlev } 3435084Sjohnlev 3445084Sjohnlev /* 3455084Sjohnlev * Add the shared_info page. This page actually ends up in the 3465084Sjohnlev * dump twice: once for the Xen va and once for the Solaris va. 3475084Sjohnlev * This isn't ideal, but we don't know the address Xen is using for 3485084Sjohnlev * the page, so we can't share it. 3495084Sjohnlev */ 3505084Sjohnlev mem_vtop.m_as = &kas; 3515084Sjohnlev mem_vtop.m_va = HYPERVISOR_shared_info; 3525084Sjohnlev mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN; 3535084Sjohnlev dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 3545084Sjohnlev xpv_dump_pages++; 3555084Sjohnlev 3565084Sjohnlev return (xpv_dump_pages); 3575084Sjohnlev } 3585084Sjohnlev 3595084Sjohnlev void 3605084Sjohnlev dump_xpv_pfn() 3615084Sjohnlev { 3625084Sjohnlev pfn_t pfn; 3635084Sjohnlev int cnt; 3645084Sjohnlev 3655084Sjohnlev for (cnt = 0; cnt < xpv_dump_pages; cnt++) { 3665084Sjohnlev pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN; 3675084Sjohnlev dumpvp_write(&pfn, sizeof (pfn)); 3685084Sjohnlev } 3695084Sjohnlev } 3705084Sjohnlev 3715084Sjohnlev int 3725084Sjohnlev dump_xpv_data(void *dump_cbuf) 3735084Sjohnlev { 3745084Sjohnlev uintptr_t va; 3755084Sjohnlev uint32_t csize; 3765084Sjohnlev int cnt = 0; 3775084Sjohnlev 3785084Sjohnlev /* 3795084Sjohnlev * XXX: we should probably run this data through a UE check. The 3805084Sjohnlev * catch is that the UE code relies on on_trap() and getpfnum() 3815084Sjohnlev * working. 3825084Sjohnlev */ 3835084Sjohnlev va = xen_virt_start; 3845084Sjohnlev 3855084Sjohnlev while (xpv_va_walk(&va) != PFN_INVALID) { 3865084Sjohnlev csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE); 3875084Sjohnlev dumpvp_write(&csize, sizeof (uint32_t)); 3885084Sjohnlev dumpvp_write(dump_cbuf, csize); 3895084Sjohnlev if (dump_ioerr) { 3905084Sjohnlev dumphdr->dump_flags &= ~DF_COMPLETE; 3915084Sjohnlev return (cnt); 3925084Sjohnlev } 3935084Sjohnlev cnt++; 3945084Sjohnlev va += MMU_PAGESIZE; 3955084Sjohnlev } 3965084Sjohnlev 3975084Sjohnlev /* 3985084Sjohnlev * Finally, dump the shared_info page 3995084Sjohnlev */ 4005084Sjohnlev csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf, 4015084Sjohnlev PAGESIZE); 4025084Sjohnlev dumpvp_write(&csize, sizeof (uint32_t)); 4035084Sjohnlev dumpvp_write(dump_cbuf, csize); 4045084Sjohnlev if (dump_ioerr) 4055084Sjohnlev dumphdr->dump_flags &= ~DF_COMPLETE; 4065084Sjohnlev cnt++; 4075084Sjohnlev 4085084Sjohnlev return (cnt); 4095084Sjohnlev } 4105084Sjohnlev 4115084Sjohnlev static void * 4125084Sjohnlev showstack(void *fpreg, int xpv_only) 4135084Sjohnlev { 4145084Sjohnlev struct frame *fpp; 4155084Sjohnlev ulong_t off; 4165084Sjohnlev char *sym; 4175084Sjohnlev uintptr_t pc, fp, lastfp; 4185084Sjohnlev uintptr_t minaddr = min(KERNELBASE, xen_virt_start); 4195084Sjohnlev 4205084Sjohnlev fp = (uintptr_t)fpreg; 4215084Sjohnlev if (fp < minaddr) { 4225084Sjohnlev xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg); 4235084Sjohnlev return (fpreg); 4245084Sjohnlev } 4255084Sjohnlev 4265084Sjohnlev do { 4275084Sjohnlev fpp = (struct frame *)fp; 4285084Sjohnlev pc = fpp->fr_savpc; 4295084Sjohnlev 4305084Sjohnlev if ((xpv_only != 0) && 4315084Sjohnlev (fp > xpv_end || fp < xen_virt_start)) 4325084Sjohnlev break; 4335084Sjohnlev if ((sym = kobj_getsymname(pc, &off)) != NULL) 4345084Sjohnlev xpv_panic_printf("%08lx %s:%s+%lx\n", fp, 4355084Sjohnlev mod_containing_pc((caddr_t)pc), sym, off); 4365084Sjohnlev else if ((pc >= xen_virt_start) && (pc <= xpv_end)) 4375084Sjohnlev xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc); 4385084Sjohnlev else 4395084Sjohnlev xpv_panic_printf("%08lx %lx\n", fp, pc); 4405084Sjohnlev 4415084Sjohnlev lastfp = fp; 4425084Sjohnlev fp = fpp->fr_savfp; 4435084Sjohnlev 4445084Sjohnlev /* 4455084Sjohnlev * Xen marks an exception frame by inverting the frame 4465084Sjohnlev * pointer. 4475084Sjohnlev */ 4485084Sjohnlev if (fp < lastfp) { 4495084Sjohnlev if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff) 4505084Sjohnlev fp = ~fp; 4515084Sjohnlev } 4525084Sjohnlev } while (fp > lastfp); 4535084Sjohnlev return ((void *)fp); 4545084Sjohnlev } 4555084Sjohnlev 4565084Sjohnlev void * 4575084Sjohnlev xpv_traceback(void *fpreg) 4585084Sjohnlev { 4595084Sjohnlev return (showstack(fpreg, 1)); 4605084Sjohnlev } 4615084Sjohnlev 4625084Sjohnlev #if defined(__amd64) 4635084Sjohnlev static void 4645084Sjohnlev xpv_panic_hypercall(ulong_t call) 4655084Sjohnlev { 4665084Sjohnlev panic("Illegally issued hypercall %d during panic!\n", (int)call); 4675084Sjohnlev } 4685084Sjohnlev #endif 4695084Sjohnlev 4705084Sjohnlev void 4715084Sjohnlev xpv_die(struct regs *rp) 4725084Sjohnlev { 4735084Sjohnlev struct panic_trap_info ti; 4745084Sjohnlev struct cregs creg; 4755084Sjohnlev 4765084Sjohnlev ti.trap_regs = rp; 4775084Sjohnlev ti.trap_type = rp->r_trapno; 4785084Sjohnlev 4795084Sjohnlev curthread->t_panic_trap = &ti; 4805084Sjohnlev if (ti.trap_type == T_PGFLT) { 4815084Sjohnlev getcregs(&creg); 4825084Sjohnlev ti.trap_addr = (caddr_t)creg.cr_cr2; 4835084Sjohnlev panic("Fatal pagefault at 0x%lx. fault addr=0x%p rp=0x%p", 4847240Srh87107 rp->r_pc, (void *)ti.trap_addr, (void *)rp); 4855084Sjohnlev } else { 4865084Sjohnlev ti.trap_addr = (caddr_t)rp->r_pc; 4875084Sjohnlev panic("Fatal trap %ld at 0x%lx. rp=0x%p", rp->r_trapno, 4887240Srh87107 rp->r_pc, (void *)rp); 4895084Sjohnlev } 4905084Sjohnlev } 4915084Sjohnlev 4925084Sjohnlev /* 4935084Sjohnlev * Build IDT to handle a Xen panic 4945084Sjohnlev */ 4955084Sjohnlev static void 4965084Sjohnlev switch_to_xpv_panic_idt() 4975084Sjohnlev { 4985084Sjohnlev int i; 4995084Sjohnlev desctbr_t idtr; 5005084Sjohnlev gate_desc_t *idt = xpv_panic_idt; 5015084Sjohnlev selector_t cs = get_cs_register(); 5025084Sjohnlev 5035084Sjohnlev for (i = 0; i < 32; i++) 504*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL, 505*8679SSeth.Goldberg@Sun.COM 0); 5065084Sjohnlev 507*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL, 508*8679SSeth.Goldberg@Sun.COM 0); 509*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0); 510*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0); 5115084Sjohnlev set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT, 512*8679SSeth.Goldberg@Sun.COM TRP_XPL, 0); 513*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL, 514*8679SSeth.Goldberg@Sun.COM 0); 515*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL, 516*8679SSeth.Goldberg@Sun.COM 0); 517*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL, 518*8679SSeth.Goldberg@Sun.COM 0); 519*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL, 520*8679SSeth.Goldberg@Sun.COM 0); 521*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0); 522*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0); 523*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0); 524*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL, 525*8679SSeth.Goldberg@Sun.COM 0); 526*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL, 527*8679SSeth.Goldberg@Sun.COM 0); 528*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0); 529*8679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0); 5305084Sjohnlev 5315084Sjohnlev /* 5325084Sjohnlev * We have no double fault handler. Any single fault represents a 5335084Sjohnlev * catastrophic failure for us, so there is no attempt to handle 5345084Sjohnlev * them cleanly: we just print a message and reboot. If we 5355084Sjohnlev * encounter a second fault while doing that, there is nothing 5365084Sjohnlev * else we can do. 5375084Sjohnlev */ 5385084Sjohnlev 5395084Sjohnlev /* 5405084Sjohnlev * Be prepared to absorb any stray device interrupts received 5415084Sjohnlev * while writing the core to disk. 5425084Sjohnlev */ 5435084Sjohnlev for (i = 33; i < NIDT; i++) 5445084Sjohnlev set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT, 545*8679SSeth.Goldberg@Sun.COM TRP_XPL, 0); 5465084Sjohnlev 5475084Sjohnlev /* The one interrupt we expect to get is from the APIC timer. */ 5485084Sjohnlev set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT, 549*8679SSeth.Goldberg@Sun.COM TRP_XPL, 0); 5505084Sjohnlev 5515084Sjohnlev idtr.dtr_base = (uintptr_t)xpv_panic_idt; 5525084Sjohnlev idtr.dtr_limit = sizeof (xpv_panic_idt) - 1; 5535084Sjohnlev wr_idtr(&idtr); 5545084Sjohnlev 5555084Sjohnlev #if defined(__amd64) 5565084Sjohnlev /* Catch any hypercalls. */ 5575084Sjohnlev wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall); 5585084Sjohnlev wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall); 5595084Sjohnlev #endif 5605084Sjohnlev } 5615084Sjohnlev 5625084Sjohnlev static void 5635084Sjohnlev xpv_apic_clkinit() 5645084Sjohnlev { 5655084Sjohnlev uint_t apic_ticks = 0; 5665084Sjohnlev 5675084Sjohnlev /* 5685084Sjohnlev * Measure how many APIC ticks there are within a fixed time 5695084Sjohnlev * period. We're going to be fairly coarse here. This timer is 5705084Sjohnlev * just being used to detect a stalled panic, so as long as we have 5715084Sjohnlev * the right order of magnitude, everything should be fine. 5725084Sjohnlev */ 5735084Sjohnlev xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR; 5745084Sjohnlev xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK; 5755084Sjohnlev xpv_apicadr[APIC_INT_VECT0] = AV_MASK; /* local intr reg 0 */ 5765084Sjohnlev 5775084Sjohnlev xpv_apicadr[APIC_DIVIDE_REG] = 0; 5785084Sjohnlev xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL; 5795084Sjohnlev drv_usecwait(XPV_TIMER_INTERVAL); 5805084Sjohnlev apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT]; 5815084Sjohnlev 5825084Sjohnlev /* 5835084Sjohnlev * apic_ticks now represents roughly how many apic ticks comprise 5845084Sjohnlev * one timeout interval. Program the timer to send us an interrupt 5855084Sjohnlev * every time that interval expires. 5865084Sjohnlev */ 5875084Sjohnlev xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_TIME; 5885084Sjohnlev xpv_apicadr[APIC_INIT_COUNT] = apic_ticks; 5895084Sjohnlev xpv_apicadr[APIC_EOI_REG] = 0; 5905084Sjohnlev } 5915084Sjohnlev 5925084Sjohnlev void 5935084Sjohnlev xpv_timer_tick(void) 5945084Sjohnlev { 5955084Sjohnlev static int ticks = 0; 5965084Sjohnlev 5975084Sjohnlev if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) { 5985084Sjohnlev ticks = 0; 5995084Sjohnlev if (dump_timeleft && (--dump_timeleft == 0)) 6005084Sjohnlev panic("Xen panic timeout\n"); 6015084Sjohnlev } 6025084Sjohnlev xpv_apicadr[APIC_EOI_REG] = 0; 6035084Sjohnlev } 6045084Sjohnlev 6055084Sjohnlev void 6065084Sjohnlev xpv_interrupt(void) 6075084Sjohnlev { 6085084Sjohnlev #ifdef DEBUG 6095084Sjohnlev static int cnt = 0; 6105084Sjohnlev 6115084Sjohnlev if (cnt++ < 10) 6125084Sjohnlev xpv_panic_printf("Unexpected interrupt received.\n"); 6135084Sjohnlev if ((cnt < 1000) && ((cnt % 100) == 0)) 6145084Sjohnlev xpv_panic_printf("%d unexpected interrupts received.\n", cnt); 6155084Sjohnlev #endif 6165084Sjohnlev 6175084Sjohnlev xpv_apicadr[APIC_EOI_REG] = 0; 6185084Sjohnlev } 6195084Sjohnlev 6205084Sjohnlev /* 6215084Sjohnlev * Managing time in panic context is trivial. We only have a single CPU, 6225084Sjohnlev * we never get rescheduled, we never get suspended. We just need to 6235084Sjohnlev * convert clock ticks into nanoseconds. 6245084Sjohnlev */ 6255084Sjohnlev static hrtime_t 6265084Sjohnlev xpv_panic_gethrtime(void) 6275084Sjohnlev { 6285084Sjohnlev hrtime_t tsc, hrt; 6295084Sjohnlev unsigned int *l = (unsigned int *)&(tsc); 6305084Sjohnlev 6315084Sjohnlev tsc = __rdtsc_insn(); 6325084Sjohnlev hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) + 6335084Sjohnlev (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT)); 6345084Sjohnlev 6355084Sjohnlev return (hrt); 6365084Sjohnlev } 6375084Sjohnlev 6385084Sjohnlev static void 6395084Sjohnlev xpv_panic_time_init() 6405084Sjohnlev { 6415084Sjohnlev nsec_scale = 6425084Sjohnlev CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT; 6435084Sjohnlev 6445084Sjohnlev gethrtimef = xpv_panic_gethrtime; 6455084Sjohnlev } 6465084Sjohnlev 6475084Sjohnlev static void 6485084Sjohnlev xpv_panicsys(struct regs *rp, char *fmt, ...) 6495084Sjohnlev { 6505084Sjohnlev extern void panicsys(const char *, va_list, struct regs *, int); 6515084Sjohnlev va_list alist; 6525084Sjohnlev 6535084Sjohnlev va_start(alist, fmt); 6545084Sjohnlev panicsys(fmt, alist, rp, 1); 6555084Sjohnlev va_end(alist); 6565084Sjohnlev } 6575084Sjohnlev 6585084Sjohnlev void 6595084Sjohnlev xpv_do_panic(void *arg) 6605084Sjohnlev { 6615084Sjohnlev struct panic_info *pip = (struct panic_info *)arg; 6625084Sjohnlev int l; 6635084Sjohnlev struct cregs creg; 6645084Sjohnlev #if defined(__amd64) 6655084Sjohnlev extern uintptr_t postbootkernelbase; 6665084Sjohnlev #endif 6675084Sjohnlev 6685084Sjohnlev if (xpv_panicking++ > 0) 6695084Sjohnlev panic("multiple calls to xpv_do_panic()"); 6705084Sjohnlev 6715084Sjohnlev /* 6725084Sjohnlev * Indicate to the underlying panic framework that a panic has been 6735084Sjohnlev * initiated. This is ordinarily done as part of vpanic(). Since 6745084Sjohnlev * we already have all the register state saved by the hypervisor, 6755084Sjohnlev * we skip that and jump straight into the panic processing code. 6767532SSean.Ye@Sun.COM * 6777532SSean.Ye@Sun.COM * XXX If another thread grabs and wins the panic_quiesce trigger 6787532SSean.Ye@Sun.COM * then we'll have two threads in panicsys believing they are in 6797532SSean.Ye@Sun.COM * charge of the panic attempt! 6805084Sjohnlev */ 6815084Sjohnlev (void) panic_trigger(&panic_quiesce); 6825084Sjohnlev 6835084Sjohnlev #if defined(__amd64) 6845084Sjohnlev /* 6855084Sjohnlev * bzero() and bcopy() get unhappy when asked to operate on 6865084Sjohnlev * addresses outside of the kernel. At this point Xen is really a 6875084Sjohnlev * part of the kernel, so we update the routines' notion of where 6885084Sjohnlev * the kernel starts. 6895084Sjohnlev */ 6905084Sjohnlev postbootkernelbase = xen_virt_start; 6915084Sjohnlev #endif 6925084Sjohnlev 6935084Sjohnlev #if defined(HYPERVISOR_VIRT_END) 6945084Sjohnlev xpv_end = HYPERVISOR_VIRT_END; 6955084Sjohnlev #else 6965084Sjohnlev xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t); 6975084Sjohnlev #endif 6985084Sjohnlev 6995084Sjohnlev /* 7005084Sjohnlev * If we were redirecting console output to the hypervisor, we have 7015084Sjohnlev * to stop. 7025084Sjohnlev */ 7035084Sjohnlev use_polledio = B_FALSE; 7045084Sjohnlev if (console == CONS_HYPERVISOR) { 7055084Sjohnlev bcons_device_change(CONS_HYPERVISOR); 7065084Sjohnlev } else if (cons_polledio != NULL && 7075084Sjohnlev cons_polledio->cons_polledio_putchar != NULL) { 7085084Sjohnlev if (cons_polledio->cons_polledio_enter != NULL) 7095084Sjohnlev cons_polledio->cons_polledio_enter( 7105084Sjohnlev cons_polledio->cons_polledio_argument); 7115084Sjohnlev use_polledio = 1; 7125084Sjohnlev } 7135084Sjohnlev 7145084Sjohnlev /* Make sure we handle all console output from here on. */ 7155084Sjohnlev sysp->bsvc_putchar = xpv_panic_putc; 7165084Sjohnlev 7175084Sjohnlev /* 7185084Sjohnlev * If we find an unsupported panic_info structure, there's not much 7195084Sjohnlev * we can do other than complain, plow on, and hope for the best. 7205084Sjohnlev */ 7215084Sjohnlev if (pip->pi_version != PANIC_INFO_VERSION) 7225084Sjohnlev xpv_panic_printf("Warning: Xen is using an unsupported " 7235084Sjohnlev "version of the panic_info structure.\n"); 7245084Sjohnlev 7255084Sjohnlev xpv_panic_info = pip; 7265084Sjohnlev 7276144Srab #if defined(__amd64) 7286144Srab kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start; 7296144Srab if (xpv_panic_info->pi_xen_start == NULL) { 7306144Srab kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end; 7316144Srab } else { 7326144Srab kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start; 7336144Srab kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end; 7346144Srab kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end; 7356144Srab } 7366144Srab #endif 7376144Srab 7385084Sjohnlev /* 7395084Sjohnlev * Make sure we are running on the Solaris %gs. The Xen panic code 7405084Sjohnlev * should already have set up the GDT properly. 7415084Sjohnlev */ 7425084Sjohnlev xpv_panic_resetgs(); 7435084Sjohnlev #if defined(__amd64) 7445084Sjohnlev wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]); 7455084Sjohnlev #endif 7465084Sjohnlev 7475084Sjohnlev xpv_panic_time_init(); 7485084Sjohnlev 7495084Sjohnlev /* 7505084Sjohnlev * Switch to our own IDT, avoiding any accidental returns to Xen 7515084Sjohnlev * world. 7525084Sjohnlev */ 7535084Sjohnlev switch_to_xpv_panic_idt(); 7545084Sjohnlev 7555084Sjohnlev /* 7565084Sjohnlev * Initialize the APIC timer, which is used to detect a hung dump 7575084Sjohnlev * attempt. 7585084Sjohnlev */ 7595084Sjohnlev xpv_apicadr = pip->pi_apic; 7605084Sjohnlev xpv_apic_clkinit(); 7615084Sjohnlev 7625084Sjohnlev /* 7635084Sjohnlev * Set up a few values that we'll need repeatedly. 7645084Sjohnlev */ 7655084Sjohnlev getcregs(&creg); 7665084Sjohnlev xpv_panic_cr3 = creg.cr_cr3; 7675084Sjohnlev for (l = mmu.max_level; l >= 0; l--) 7685084Sjohnlev xpv_panic_nptes[l] = mmu.ptes_per_table; 7695084Sjohnlev #ifdef __i386 7705084Sjohnlev if (mmu.pae_hat) 7715084Sjohnlev xpv_panic_nptes[mmu.max_level] = 4; 7725084Sjohnlev #endif 7735084Sjohnlev 7745084Sjohnlev /* Add the fake Xen module to the module list */ 7755084Sjohnlev if (xpv_module != NULL) { 7765084Sjohnlev extern int last_module_id; 7775084Sjohnlev 7785084Sjohnlev xpv_modctl->mod_id = last_module_id++; 7795084Sjohnlev xpv_modctl->mod_next = &modules; 7805084Sjohnlev xpv_modctl->mod_prev = modules.mod_prev; 7815084Sjohnlev modules.mod_prev->mod_next = xpv_modctl; 7825084Sjohnlev modules.mod_prev = xpv_modctl; 7835084Sjohnlev } 7847532SSean.Ye@Sun.COM 7857532SSean.Ye@Sun.COM if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC) 7867532SSean.Ye@Sun.COM xpv_mca_panic_data = &pip->pi_mca; 7877532SSean.Ye@Sun.COM 7885084Sjohnlev xpv_panic_printf = printf; 7895084Sjohnlev xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr); 7905084Sjohnlev xpv_panic_printf("Failed to reboot following panic.\n"); 7915084Sjohnlev for (;;) 7925084Sjohnlev ; 7935084Sjohnlev } 7945084Sjohnlev 7955084Sjohnlev /* 7965084Sjohnlev * Set up the necessary data structures to pretend that the Xen hypervisor 7975084Sjohnlev * is a loadable module, allowing mdb to find the Xen symbols in a crash 7985084Sjohnlev * dump. Since these symbols all map to VA space Solaris doesn't normally 7995084Sjohnlev * have access to, we don't link these structures into the kernel's lists 8005084Sjohnlev * until/unless we hit a Xen panic. 8015084Sjohnlev * 8025084Sjohnlev * The observant reader will note a striking amount of overlap between this 8035084Sjohnlev * code and that found in krtld. While it would be handy if we could just 8045084Sjohnlev * ask krtld to do this work for us, it's not that simple. Among the 8055084Sjohnlev * complications: we're not actually loading the text here (grub did it at 8065084Sjohnlev * boot), the .text section is writable, there are no relocations to do, 8075084Sjohnlev * none of the module text/data is in readable memory, etc. Training krtld 8085084Sjohnlev * to deal with this weird module is as complicated, and more risky, than 8095084Sjohnlev * reimplementing the necessary subset of it here. 8105084Sjohnlev */ 8115084Sjohnlev static void 8125084Sjohnlev init_xen_module() 8135084Sjohnlev { 8145084Sjohnlev struct _buf *file = NULL; 8155084Sjohnlev struct module *mp; 8165084Sjohnlev struct modctl *mcp; 8175084Sjohnlev int i, shn; 8185084Sjohnlev Shdr *shp, *ctf_shp; 8195084Sjohnlev char *names = NULL; 8205084Sjohnlev size_t n, namesize, text_align, data_align; 8215084Sjohnlev #if defined(__amd64) 8225084Sjohnlev const char machine = EM_AMD64; 8235084Sjohnlev #else 8245084Sjohnlev const char machine = EM_386; 8255084Sjohnlev #endif 8265084Sjohnlev 8275084Sjohnlev /* Allocate and init the module structure */ 8285084Sjohnlev mp = kmem_zalloc(sizeof (*mp), KM_SLEEP); 8295084Sjohnlev mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 8305084Sjohnlev (void) strcpy(mp->filename, XPV_FILENAME); 8315084Sjohnlev 8325084Sjohnlev /* Allocate and init the modctl structure */ 8335084Sjohnlev mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP); 8345084Sjohnlev mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP); 8355084Sjohnlev (void) strcpy(mcp->mod_modname, XPV_MODNAME); 8365084Sjohnlev mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP); 8375084Sjohnlev (void) strcpy(mcp->mod_filename, XPV_FILENAME); 8385084Sjohnlev mcp->mod_inprogress_thread = (kthread_id_t)-1; 8395084Sjohnlev mcp->mod_ref = 1; 8405084Sjohnlev mcp->mod_loaded = 1; 8415084Sjohnlev mcp->mod_loadcnt = 1; 8425084Sjohnlev mcp->mod_mp = mp; 8435084Sjohnlev 8445084Sjohnlev /* 8455084Sjohnlev * Try to open a Xen image that hasn't had its symbol and CTF 8465084Sjohnlev * information stripped off. 8475084Sjohnlev */ 8485084Sjohnlev file = kobj_open_file(XPV_FILENAME); 8495084Sjohnlev if (file == (struct _buf *)-1) { 8505084Sjohnlev file = NULL; 8515084Sjohnlev goto err; 8525084Sjohnlev } 8535084Sjohnlev 8545084Sjohnlev /* 8555084Sjohnlev * Read the header and ensure that this is an ELF file for the 8565084Sjohnlev * proper ISA. If it's not, somebody has done something very 8575084Sjohnlev * stupid. Why bother? See Mencken. 8585084Sjohnlev */ 8595084Sjohnlev if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0) 8605084Sjohnlev goto err; 8615084Sjohnlev for (i = 0; i < SELFMAG; i++) 8625084Sjohnlev if (mp->hdr.e_ident[i] != ELFMAG[i]) 8635084Sjohnlev goto err; 8645084Sjohnlev if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) || 8655084Sjohnlev (mp->hdr.e_machine != machine)) 8665084Sjohnlev goto err; 8675084Sjohnlev 8685084Sjohnlev /* Read in the section headers */ 8695084Sjohnlev n = mp->hdr.e_shentsize * mp->hdr.e_shnum; 8705084Sjohnlev mp->shdrs = kmem_zalloc(n, KM_SLEEP); 8715084Sjohnlev if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0) 8725084Sjohnlev goto err; 8735084Sjohnlev 8745084Sjohnlev /* Read the section names */ 8755084Sjohnlev shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize); 8765084Sjohnlev namesize = shp->sh_size; 8775084Sjohnlev names = kmem_zalloc(shp->sh_size, KM_SLEEP); 8785084Sjohnlev if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0) 8795084Sjohnlev goto err; 8805084Sjohnlev 8815084Sjohnlev /* 8825084Sjohnlev * Fill in the text and data size fields. 8835084Sjohnlev */ 8845084Sjohnlev ctf_shp = NULL; 8855084Sjohnlev text_align = data_align = 0; 8865084Sjohnlev for (shn = 1; shn < mp->hdr.e_shnum; shn++) { 8875084Sjohnlev shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize); 8885084Sjohnlev 8895084Sjohnlev /* Sanity check the offset of the section name */ 8905084Sjohnlev if (shp->sh_name >= namesize) 8915084Sjohnlev continue; 8925084Sjohnlev 8935084Sjohnlev /* If we find the symtab section, remember it for later. */ 8945084Sjohnlev if (shp->sh_type == SHT_SYMTAB) { 8955084Sjohnlev mp->symtbl_section = shn; 8965084Sjohnlev mp->symhdr = shp; 8975084Sjohnlev continue; 8985084Sjohnlev } 8995084Sjohnlev 9005084Sjohnlev /* If we find the CTF section, remember it for later. */ 9015084Sjohnlev if ((shp->sh_size != 0) && 9025084Sjohnlev (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) { 9035084Sjohnlev ctf_shp = shp; 9045084Sjohnlev continue; 9055084Sjohnlev } 9065084Sjohnlev 9075084Sjohnlev if (!(shp->sh_flags & SHF_ALLOC)) 9085084Sjohnlev continue; 9095084Sjohnlev 9105084Sjohnlev /* 9115084Sjohnlev * Xen marks its text section as writable, so we need to 9125084Sjohnlev * look for the name - not just the flag. 9135084Sjohnlev */ 9145084Sjohnlev if ((strcmp(&names[shp->sh_name], ".text") != NULL) && 9155084Sjohnlev (shp->sh_flags & SHF_WRITE) != 0) { 9165084Sjohnlev if (shp->sh_addralign > data_align) 9175084Sjohnlev data_align = shp->sh_addralign; 9185084Sjohnlev mp->data_size = ALIGN(mp->data_size, data_align); 9195084Sjohnlev mp->data_size += ALIGN(shp->sh_size, 8); 9205084Sjohnlev if (mp->data == NULL || mp->data > (char *)shp->sh_addr) 9215084Sjohnlev mp->data = (char *)shp->sh_addr; 9225084Sjohnlev } else { 9235084Sjohnlev if (shp->sh_addralign > text_align) 9245084Sjohnlev text_align = shp->sh_addralign; 9255084Sjohnlev mp->text_size = ALIGN(mp->text_size, text_align); 9265084Sjohnlev mp->text_size += ALIGN(shp->sh_size, 8); 9275084Sjohnlev if (mp->text == NULL || mp->text > (char *)shp->sh_addr) 9285084Sjohnlev mp->text = (char *)shp->sh_addr; 9295084Sjohnlev } 9305084Sjohnlev } 9315084Sjohnlev kmem_free(names, namesize); 9325084Sjohnlev names = NULL; 9335249Snn35248 shp = NULL; 9345084Sjohnlev mcp->mod_text = mp->text; 9355084Sjohnlev mcp->mod_text_size = mp->text_size; 9365084Sjohnlev 9375084Sjohnlev /* 9385084Sjohnlev * If we have symbol table and string table sections, read them in 9395084Sjohnlev * now. If we don't, we just plow on. We'll still get a valid 9405084Sjohnlev * core dump, but finding anything useful will be just a bit 9415084Sjohnlev * harder. 9425084Sjohnlev * 9435084Sjohnlev * Note: we don't bother with a hash table. We'll never do a 9445084Sjohnlev * symbol lookup unless we crash, and then mdb creates its own. We 9455084Sjohnlev * also don't try to perform any relocations. Xen should be loaded 9465084Sjohnlev * exactly where the ELF file indicates, and the symbol information 9475084Sjohnlev * in the file should be complete and correct already. Static 9485084Sjohnlev * linking ain't all bad. 9495084Sjohnlev */ 9505084Sjohnlev if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) { 9515084Sjohnlev mp->strhdr = (Shdr *) 9525084Sjohnlev (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize); 9535084Sjohnlev mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize; 9545084Sjohnlev 9555084Sjohnlev /* Allocate space for the symbol table and strings. */ 9565084Sjohnlev mp->symsize = mp->symhdr->sh_size + 9575084Sjohnlev mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size; 9585084Sjohnlev mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP); 9595084Sjohnlev mp->symtbl = mp->symspace; 9605084Sjohnlev mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size); 9615084Sjohnlev 9625084Sjohnlev if ((kobj_read_file(file, mp->symtbl, 9635084Sjohnlev mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) || 9645084Sjohnlev (kobj_read_file(file, mp->strings, 9655084Sjohnlev mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0)) 9665084Sjohnlev goto err; 9675084Sjohnlev } 9685084Sjohnlev 9695084Sjohnlev /* 9705084Sjohnlev * Read in the CTF section 9715084Sjohnlev */ 9725084Sjohnlev if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) { 9735249Snn35248 mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP); 9745084Sjohnlev mp->ctfsize = ctf_shp->sh_size; 9755084Sjohnlev if (kobj_read_file(file, mp->ctfdata, mp->ctfsize, 9765084Sjohnlev ctf_shp->sh_offset) < 0) 9775084Sjohnlev goto err; 9785084Sjohnlev } 9795084Sjohnlev 9805084Sjohnlev kobj_close_file(file); 9815084Sjohnlev 9825084Sjohnlev xpv_module = mp; 9835084Sjohnlev xpv_modctl = mcp; 9845084Sjohnlev return; 9855084Sjohnlev 9865084Sjohnlev err: 9875084Sjohnlev cmn_err(CE_WARN, "Failed to initialize xpv module."); 9885084Sjohnlev if (file != NULL) 9895084Sjohnlev kobj_close_file(file); 9905084Sjohnlev 9915084Sjohnlev kmem_free(mp->filename, strlen(XPV_FILENAME) + 1); 9925084Sjohnlev if (mp->shdrs != NULL) 9935084Sjohnlev kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum); 9945084Sjohnlev if (mp->symspace != NULL) 9955084Sjohnlev kmem_free(mp->symspace, mp->symsize); 9965084Sjohnlev if (mp->ctfdata != NULL) 9975084Sjohnlev kmem_free(mp->ctfdata, mp->ctfsize); 9985084Sjohnlev kmem_free(mp, sizeof (*mp)); 9995084Sjohnlev kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1); 10005084Sjohnlev kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1); 10015084Sjohnlev kmem_free(mcp, sizeof (*mcp)); 10025084Sjohnlev if (names != NULL) 10035084Sjohnlev kmem_free(names, namesize); 10045084Sjohnlev } 10055084Sjohnlev 10065084Sjohnlev void 10075084Sjohnlev xpv_panic_init() 10085084Sjohnlev { 10095084Sjohnlev xen_platform_op_t op; 10105084Sjohnlev int i; 10115084Sjohnlev 10125084Sjohnlev ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); 10135084Sjohnlev 10145084Sjohnlev for (i = 0; i < mmu.num_level; i++) 10155084Sjohnlev ptable_pfn[i] = PFN_INVALID; 10165084Sjohnlev 10175084Sjohnlev /* Let Xen know where to jump if/when it panics. */ 10185084Sjohnlev op.cmd = XENPF_panic_init; 10195084Sjohnlev op.interface_version = XENPF_INTERFACE_VERSION; 10205084Sjohnlev op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr; 10215084Sjohnlev 10225084Sjohnlev (void) HYPERVISOR_platform_op(&op); 10235084Sjohnlev 10245084Sjohnlev init_xen_module(); 10255084Sjohnlev } 1026