15084Sjohnlev /*
25084Sjohnlev * CDDL HEADER START
35084Sjohnlev *
45084Sjohnlev * The contents of this file are subject to the terms of the
55084Sjohnlev * Common Development and Distribution License (the "License").
65084Sjohnlev * You may not use this file except in compliance with the License.
75084Sjohnlev *
85084Sjohnlev * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
95084Sjohnlev * or http://www.opensolaris.org/os/licensing.
105084Sjohnlev * See the License for the specific language governing permissions
115084Sjohnlev * and limitations under the License.
125084Sjohnlev *
135084Sjohnlev * When distributing Covered Code, include this CDDL HEADER in each
145084Sjohnlev * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
155084Sjohnlev * If applicable, add the following below this CDDL HEADER, with the
165084Sjohnlev * fields enclosed by brackets "[]" replaced with your own identifying
175084Sjohnlev * information: Portions Copyright [yyyy] [name of copyright owner]
185084Sjohnlev *
195084Sjohnlev * CDDL HEADER END
205084Sjohnlev */
215084Sjohnlev /*
22*13029SKrishnendu.Sadhukhan@Sun.COM * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
235084Sjohnlev */
245084Sjohnlev
255084Sjohnlev #include <sys/types.h>
265084Sjohnlev #include <sys/clock.h>
275084Sjohnlev #include <sys/psm.h>
285084Sjohnlev #include <sys/archsystm.h>
295084Sjohnlev #include <sys/machsystm.h>
305084Sjohnlev #include <sys/compress.h>
315084Sjohnlev #include <sys/modctl.h>
325084Sjohnlev #include <sys/trap.h>
335084Sjohnlev #include <sys/panic.h>
345084Sjohnlev #include <sys/regset.h>
355084Sjohnlev #include <sys/frame.h>
365084Sjohnlev #include <sys/kobj.h>
375084Sjohnlev #include <sys/apic.h>
38*13029SKrishnendu.Sadhukhan@Sun.COM #include <sys/apic_timer.h>
395084Sjohnlev #include <sys/dumphdr.h>
405084Sjohnlev #include <sys/mem.h>
415084Sjohnlev #include <sys/x86_archext.h>
425084Sjohnlev #include <sys/xpv_panic.h>
435084Sjohnlev #include <sys/boot_console.h>
445084Sjohnlev #include <sys/bootsvcs.h>
455084Sjohnlev #include <sys/consdev.h>
465084Sjohnlev #include <vm/hat_pte.h>
475084Sjohnlev #include <vm/hat_i86.h>
485084Sjohnlev
495084Sjohnlev /* XXX: need to add a PAE version too, if we ever support both PAE and non */
505084Sjohnlev #if defined(__i386)
515084Sjohnlev #define XPV_FILENAME "/boot/xen-syms"
525084Sjohnlev #else
535084Sjohnlev #define XPV_FILENAME "/boot/amd64/xen-syms"
545084Sjohnlev #endif
555084Sjohnlev #define XPV_MODNAME "xpv"
565084Sjohnlev
575084Sjohnlev int xpv_panicking = 0;
585084Sjohnlev
595084Sjohnlev struct module *xpv_module;
605084Sjohnlev struct modctl *xpv_modctl;
615084Sjohnlev
625084Sjohnlev #define ALIGN(x, a) ((a) == 0 ? (uintptr_t)(x) : \
635084Sjohnlev (((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l)))
645084Sjohnlev
655084Sjohnlev /* Pointer to the xpv_panic_info structure handed to us by Xen. */
665084Sjohnlev static struct panic_info *xpv_panic_info = NULL;
675084Sjohnlev
685084Sjohnlev /* Timer support */
695084Sjohnlev #define NSEC_SHIFT 5
705084Sjohnlev #define T_XPV_TIMER 0xd1
715084Sjohnlev #define XPV_TIMER_INTERVAL 1000 /* 1000 microseconds */
725084Sjohnlev static uint32_t *xpv_apicadr = NULL;
735084Sjohnlev static uint_t nsec_scale;
745084Sjohnlev
755084Sjohnlev /* IDT support */
765084Sjohnlev #pragma align 16(xpv_panic_idt)
775084Sjohnlev static gate_desc_t xpv_panic_idt[NIDT]; /* interrupt descriptor table */
785084Sjohnlev
795084Sjohnlev /* Xen pagetables mapped into our HAT's ptable windows */
805084Sjohnlev static pfn_t ptable_pfn[MAX_NUM_LEVEL];
815084Sjohnlev
825084Sjohnlev /* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
835084Sjohnlev static int xpv_dump_pages;
845084Sjohnlev
855084Sjohnlev /*
866144Srab * There are up to two large swathes of RAM that we don't want to include
876144Srab * in the dump: those that comprise the Xen version of segkpm. On 32-bit
886144Srab * systems there is no such region of memory. On 64-bit systems, there
896144Srab * should be just a single contiguous region that corresponds to all of
906144Srab * physical memory. The tricky bit is that Xen's heap sometimes lives in
916144Srab * the middle of their segkpm, and is mapped using only kpm-like addresses.
926144Srab * In that case, we need to skip the swathes before and after Xen's heap.
936144Srab */
946144Srab uintptr_t kpm1_low = 0;
956144Srab uintptr_t kpm1_high = 0;
966144Srab uintptr_t kpm2_low = 0;
976144Srab uintptr_t kpm2_high = 0;
986144Srab
996144Srab /*
1005084Sjohnlev * Some commonly used values that we don't want to recompute over and over.
1015084Sjohnlev */
1025084Sjohnlev static int xpv_panic_nptes[MAX_NUM_LEVEL];
1035084Sjohnlev static ulong_t xpv_panic_cr3;
1045084Sjohnlev static uintptr_t xpv_end;
1055084Sjohnlev
1065084Sjohnlev static void xpv_panic_console_print(const char *fmt, ...);
1075084Sjohnlev static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;
1085084Sjohnlev
1095084Sjohnlev #define CONSOLE_BUF_SIZE 256
1105084Sjohnlev static char console_buffer[CONSOLE_BUF_SIZE];
1115084Sjohnlev static boolean_t use_polledio;
1125084Sjohnlev
1137532SSean.Ye@Sun.COM /*
1147532SSean.Ye@Sun.COM * Pointers to machine check panic info (if any).
1157532SSean.Ye@Sun.COM */
1167532SSean.Ye@Sun.COM xpv_mca_panic_data_t *xpv_mca_panic_data = NULL;
1177532SSean.Ye@Sun.COM
1185084Sjohnlev static void
xpv_panic_putc(int m)1195084Sjohnlev xpv_panic_putc(int m)
1205084Sjohnlev {
1215084Sjohnlev struct cons_polledio *c = cons_polledio;
1225084Sjohnlev
1235084Sjohnlev /* This really shouldn't happen */
1245084Sjohnlev if (console == CONS_HYPERVISOR)
1255084Sjohnlev return;
1265084Sjohnlev
1275084Sjohnlev if (use_polledio == B_TRUE)
1285084Sjohnlev c->cons_polledio_putchar(c->cons_polledio_argument, m);
1295084Sjohnlev else
1305084Sjohnlev bcons_putchar(m);
1315084Sjohnlev }
1325084Sjohnlev
1335084Sjohnlev static void
xpv_panic_puts(char * msg)1345084Sjohnlev xpv_panic_puts(char *msg)
1355084Sjohnlev {
1365084Sjohnlev char *m;
1375084Sjohnlev
1385084Sjohnlev dump_timeleft = dump_timeout;
1395084Sjohnlev for (m = msg; *m; m++)
1405084Sjohnlev xpv_panic_putc((int)*m);
1415084Sjohnlev }
1425084Sjohnlev
1435084Sjohnlev static void
xpv_panic_console_print(const char * fmt,...)1445084Sjohnlev xpv_panic_console_print(const char *fmt, ...)
1455084Sjohnlev {
1465084Sjohnlev va_list ap;
1475084Sjohnlev
1485084Sjohnlev va_start(ap, fmt);
1495084Sjohnlev (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap);
1505084Sjohnlev va_end(ap);
1515084Sjohnlev
1525084Sjohnlev xpv_panic_puts(console_buffer);
1535084Sjohnlev }
1545084Sjohnlev
1555084Sjohnlev static void
xpv_panic_map(int level,pfn_t pfn)1565084Sjohnlev xpv_panic_map(int level, pfn_t pfn)
1575084Sjohnlev {
1585084Sjohnlev x86pte_t pte, *pteptr;
1595084Sjohnlev
1605084Sjohnlev /*
1615084Sjohnlev * The provided pfn represents a level 'level' page table. Map it
1625084Sjohnlev * into the 'level' slot in the list of page table windows.
1635084Sjohnlev */
1645084Sjohnlev pteptr = (x86pte_t *)PWIN_PTE_VA(level);
1655084Sjohnlev pte = pfn_to_pa(pfn) | PT_VALID;
1665084Sjohnlev
1675084Sjohnlev XPV_ALLOW_PAGETABLE_UPDATES();
1685084Sjohnlev if (mmu.pae_hat)
1695084Sjohnlev *pteptr = pte;
1705084Sjohnlev else
1715084Sjohnlev *(x86pte32_t *)pteptr = pte;
1725084Sjohnlev XPV_DISALLOW_PAGETABLE_UPDATES();
1735084Sjohnlev
1745084Sjohnlev mmu_tlbflush_entry(PWIN_VA(level));
1755084Sjohnlev }
1765084Sjohnlev
1775084Sjohnlev /*
1785084Sjohnlev * Walk the page tables to find the pfn mapped by the given va.
1795084Sjohnlev */
1805084Sjohnlev static pfn_t
xpv_va_walk(uintptr_t * vaddr)1815084Sjohnlev xpv_va_walk(uintptr_t *vaddr)
1825084Sjohnlev {
1835084Sjohnlev int l, idx;
1845084Sjohnlev pfn_t pfn;
1855084Sjohnlev x86pte_t pte;
1865084Sjohnlev x86pte_t *ptep;
1875084Sjohnlev uintptr_t va = *vaddr;
1885084Sjohnlev uintptr_t scan_va;
1895084Sjohnlev caddr_t ptable_window;
1905084Sjohnlev static pfn_t toplevel_pfn;
1915084Sjohnlev static uintptr_t lastva;
1925084Sjohnlev
1935084Sjohnlev /*
1945084Sjohnlev * If we do anything other than a simple scan through memory, don't
1955084Sjohnlev * trust the mapped page tables.
1965084Sjohnlev */
1975084Sjohnlev if (va != lastva + MMU_PAGESIZE)
1985084Sjohnlev for (l = mmu.max_level; l >= 0; l--)
1995084Sjohnlev ptable_pfn[l] = PFN_INVALID;
2005084Sjohnlev
2015084Sjohnlev toplevel_pfn = mmu_btop(xpv_panic_cr3);
2025084Sjohnlev
2035084Sjohnlev while (va < xpv_end && va >= *vaddr) {
2045084Sjohnlev /* Find the lowest table with any entry for va */
2055084Sjohnlev pfn = toplevel_pfn;
2065084Sjohnlev for (l = mmu.max_level; l >= 0; l--) {
2075084Sjohnlev if (ptable_pfn[l] != pfn) {
2085084Sjohnlev xpv_panic_map(l, pfn);
2095084Sjohnlev ptable_pfn[l] = pfn;
2105084Sjohnlev }
2115084Sjohnlev
2125084Sjohnlev /*
2135084Sjohnlev * Search this pagetable for any mapping to an
2145084Sjohnlev * address >= va.
2155084Sjohnlev */
2165084Sjohnlev ptable_window = PWIN_VA(l);
2175084Sjohnlev if (l == mmu.max_level && mmu.pae_hat)
2185084Sjohnlev ptable_window +=
2195084Sjohnlev (xpv_panic_cr3 & MMU_PAGEOFFSET);
2205084Sjohnlev
2215084Sjohnlev idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1);
2225084Sjohnlev scan_va = va;
2235084Sjohnlev while (idx < xpv_panic_nptes[l] && scan_va < xpv_end &&
2245084Sjohnlev scan_va >= *vaddr) {
2255084Sjohnlev ptep = (x86pte_t *)(ptable_window +
2265084Sjohnlev (idx << mmu.pte_size_shift));
2275084Sjohnlev pte = GET_PTE(ptep);
2285084Sjohnlev if (pte & PTE_VALID)
2295084Sjohnlev break;
2305084Sjohnlev idx++;
2315084Sjohnlev scan_va += mmu.level_size[l];
2325084Sjohnlev }
2335084Sjohnlev
2345084Sjohnlev /*
2355084Sjohnlev * If there are no valid mappings in this table, we
2365084Sjohnlev * can skip to the end of the VA range it covers.
2375084Sjohnlev */
2385084Sjohnlev if (idx == xpv_panic_nptes[l]) {
2395084Sjohnlev va = NEXT_ENTRY_VA(va, l + 1);
2405084Sjohnlev break;
2415084Sjohnlev }
2425084Sjohnlev
2436144Srab va = scan_va;
2446144Srab /*
2456144Srab * See if we've hit the end of the range.
2466144Srab */
2476144Srab if (va >= xpv_end || va < *vaddr)
2486144Srab break;
2496144Srab
2505084Sjohnlev /*
2515084Sjohnlev * If this mapping is for a pagetable, we drop down
2525084Sjohnlev * to the next level in the hierarchy and look for
2535084Sjohnlev * a mapping in it.
2545084Sjohnlev */
2555084Sjohnlev pfn = PTE2MFN(pte, l);
2565084Sjohnlev if (!PTE_ISPAGE(pte, l))
2575084Sjohnlev continue;
2585084Sjohnlev
2595084Sjohnlev /*
2605084Sjohnlev * The APIC page is magic. Nothing to see here;
2615084Sjohnlev * move along.
2625084Sjohnlev */
2635084Sjohnlev if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) ==
2645084Sjohnlev (va & MMU_PAGEMASK)) {
2655084Sjohnlev va += MMU_PAGESIZE;
2665084Sjohnlev break;
2675084Sjohnlev }
2685084Sjohnlev
2696144Srab /*
2706144Srab * See if the address is within one of the two
2716144Srab * kpm-like regions we want to skip.
2726144Srab */
2736144Srab if (va >= kpm1_low && va < kpm1_high) {
2746144Srab va = kpm1_high;
2756144Srab break;
2766144Srab }
2776144Srab if (va >= kpm2_low && va < kpm2_high) {
2786144Srab va = kpm2_high;
2795084Sjohnlev break;
2805084Sjohnlev }
2815084Sjohnlev
2825084Sjohnlev /*
2835084Sjohnlev * The Xen panic code only handles small pages. If
2845084Sjohnlev * this mapping is for a large page, we need to
2855084Sjohnlev * identify the consituent page that covers the
2865084Sjohnlev * specific VA we were looking for.
2875084Sjohnlev */
2885084Sjohnlev if (l > 0) {
2895084Sjohnlev if (l > 1)
2905084Sjohnlev panic("Xen panic can't cope with "
2915084Sjohnlev "giant pages.");
2925084Sjohnlev idx = (va >> LEVEL_SHIFT(0)) &
2935084Sjohnlev (xpv_panic_nptes[0] - 1);
2945084Sjohnlev pfn += idx;
2955084Sjohnlev }
2965084Sjohnlev
2975084Sjohnlev *vaddr = va;
2985084Sjohnlev lastva = va;
2995084Sjohnlev return (pfn | PFN_IS_FOREIGN_MFN);
3005084Sjohnlev }
3015084Sjohnlev }
3025084Sjohnlev return (PFN_INVALID);
3035084Sjohnlev }
3045084Sjohnlev
3055084Sjohnlev /*
3065084Sjohnlev * Walk through the Xen VA space, finding pages that are mapped in.
3075084Sjohnlev *
3085084Sjohnlev * These pages all have MFNs rather than PFNs, meaning they may be outside
3095084Sjohnlev * the physical address space the kernel knows about, or they may collide
3105084Sjohnlev * with PFNs the kernel is using.
3115084Sjohnlev *
3125084Sjohnlev * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs
3135084Sjohnlev * to avoid collisions doesn't work. The pages need to be written to disk
3145084Sjohnlev * in PFN-order or savecore gets confused. We can't allocate memory to
3155084Sjohnlev * contruct a sorted pfn->VA reverse mapping, so we have to write the pages
3165084Sjohnlev * to disk in VA order.
3175084Sjohnlev *
3185084Sjohnlev * To square this circle, we simply make up PFNs for each of Xen's pages.
3195084Sjohnlev * We assign each mapped page a fake PFN in ascending order. These fake
3205084Sjohnlev * PFNs each have the FOREIGN bit set, ensuring that they fall outside the
3215084Sjohnlev * range of Solaris PFNs written by the kernel.
3225084Sjohnlev */
3235084Sjohnlev int
dump_xpv_addr()3245084Sjohnlev dump_xpv_addr()
3255084Sjohnlev {
3265084Sjohnlev uintptr_t va;
3275084Sjohnlev mem_vtop_t mem_vtop;
3285084Sjohnlev
3295084Sjohnlev xpv_dump_pages = 0;
3305084Sjohnlev va = xen_virt_start;
3315084Sjohnlev
3325084Sjohnlev while (xpv_va_walk(&va) != PFN_INVALID) {
3335084Sjohnlev mem_vtop.m_as = &kas;
3345084Sjohnlev mem_vtop.m_va = (void *)va;
3355084Sjohnlev mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
3365084Sjohnlev
3375084Sjohnlev dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
3385084Sjohnlev xpv_dump_pages++;
3395084Sjohnlev
3405084Sjohnlev va += MMU_PAGESIZE;
3415084Sjohnlev }
3425084Sjohnlev
3435084Sjohnlev /*
3445084Sjohnlev * Add the shared_info page. This page actually ends up in the
3455084Sjohnlev * dump twice: once for the Xen va and once for the Solaris va.
3465084Sjohnlev * This isn't ideal, but we don't know the address Xen is using for
3475084Sjohnlev * the page, so we can't share it.
3485084Sjohnlev */
3495084Sjohnlev mem_vtop.m_as = &kas;
3505084Sjohnlev mem_vtop.m_va = HYPERVISOR_shared_info;
3515084Sjohnlev mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
3525084Sjohnlev dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
3535084Sjohnlev xpv_dump_pages++;
3545084Sjohnlev
3555084Sjohnlev return (xpv_dump_pages);
3565084Sjohnlev }
3575084Sjohnlev
3585084Sjohnlev void
dump_xpv_pfn()3595084Sjohnlev dump_xpv_pfn()
3605084Sjohnlev {
3615084Sjohnlev pfn_t pfn;
3625084Sjohnlev int cnt;
3635084Sjohnlev
3645084Sjohnlev for (cnt = 0; cnt < xpv_dump_pages; cnt++) {
3655084Sjohnlev pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN;
3665084Sjohnlev dumpvp_write(&pfn, sizeof (pfn));
3675084Sjohnlev }
3685084Sjohnlev }
3695084Sjohnlev
3705084Sjohnlev int
dump_xpv_data(void * dump_cbuf)3715084Sjohnlev dump_xpv_data(void *dump_cbuf)
3725084Sjohnlev {
3735084Sjohnlev uintptr_t va;
3745084Sjohnlev uint32_t csize;
3755084Sjohnlev int cnt = 0;
3765084Sjohnlev
3775084Sjohnlev /*
3785084Sjohnlev * XXX: we should probably run this data through a UE check. The
3795084Sjohnlev * catch is that the UE code relies on on_trap() and getpfnum()
3805084Sjohnlev * working.
3815084Sjohnlev */
3825084Sjohnlev va = xen_virt_start;
3835084Sjohnlev
3845084Sjohnlev while (xpv_va_walk(&va) != PFN_INVALID) {
3855084Sjohnlev csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE);
3865084Sjohnlev dumpvp_write(&csize, sizeof (uint32_t));
3875084Sjohnlev dumpvp_write(dump_cbuf, csize);
3885084Sjohnlev if (dump_ioerr) {
3895084Sjohnlev dumphdr->dump_flags &= ~DF_COMPLETE;
3905084Sjohnlev return (cnt);
3915084Sjohnlev }
3925084Sjohnlev cnt++;
3935084Sjohnlev va += MMU_PAGESIZE;
3945084Sjohnlev }
3955084Sjohnlev
3965084Sjohnlev /*
3975084Sjohnlev * Finally, dump the shared_info page
3985084Sjohnlev */
3995084Sjohnlev csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf,
4005084Sjohnlev PAGESIZE);
4015084Sjohnlev dumpvp_write(&csize, sizeof (uint32_t));
4025084Sjohnlev dumpvp_write(dump_cbuf, csize);
4035084Sjohnlev if (dump_ioerr)
4045084Sjohnlev dumphdr->dump_flags &= ~DF_COMPLETE;
4055084Sjohnlev cnt++;
4065084Sjohnlev
4075084Sjohnlev return (cnt);
4085084Sjohnlev }
4095084Sjohnlev
4105084Sjohnlev static void *
showstack(void * fpreg,int xpv_only)4115084Sjohnlev showstack(void *fpreg, int xpv_only)
4125084Sjohnlev {
4135084Sjohnlev struct frame *fpp;
4145084Sjohnlev ulong_t off;
4155084Sjohnlev char *sym;
4165084Sjohnlev uintptr_t pc, fp, lastfp;
4175084Sjohnlev uintptr_t minaddr = min(KERNELBASE, xen_virt_start);
4185084Sjohnlev
4195084Sjohnlev fp = (uintptr_t)fpreg;
4205084Sjohnlev if (fp < minaddr) {
4215084Sjohnlev xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg);
4225084Sjohnlev return (fpreg);
4235084Sjohnlev }
4245084Sjohnlev
4255084Sjohnlev do {
4265084Sjohnlev fpp = (struct frame *)fp;
4275084Sjohnlev pc = fpp->fr_savpc;
4285084Sjohnlev
4295084Sjohnlev if ((xpv_only != 0) &&
4305084Sjohnlev (fp > xpv_end || fp < xen_virt_start))
4315084Sjohnlev break;
4325084Sjohnlev if ((sym = kobj_getsymname(pc, &off)) != NULL)
4335084Sjohnlev xpv_panic_printf("%08lx %s:%s+%lx\n", fp,
4345084Sjohnlev mod_containing_pc((caddr_t)pc), sym, off);
4355084Sjohnlev else if ((pc >= xen_virt_start) && (pc <= xpv_end))
4365084Sjohnlev xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc);
4375084Sjohnlev else
4385084Sjohnlev xpv_panic_printf("%08lx %lx\n", fp, pc);
4395084Sjohnlev
4405084Sjohnlev lastfp = fp;
4415084Sjohnlev fp = fpp->fr_savfp;
4425084Sjohnlev
4435084Sjohnlev /*
4445084Sjohnlev * Xen marks an exception frame by inverting the frame
4455084Sjohnlev * pointer.
4465084Sjohnlev */
4475084Sjohnlev if (fp < lastfp) {
4485084Sjohnlev if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff)
4495084Sjohnlev fp = ~fp;
4505084Sjohnlev }
4515084Sjohnlev } while (fp > lastfp);
4525084Sjohnlev return ((void *)fp);
4535084Sjohnlev }
4545084Sjohnlev
/*
 * Print the Xen-only portion of the stack starting at 'fpreg' and return
 * the frame pointer at which showstack() stopped.
 */
void *
xpv_traceback(void *fpreg)
{
	void *stop_fp;

	stop_fp = showstack(fpreg, 1);
	return (stop_fp);
}
4605084Sjohnlev
4615084Sjohnlev #if defined(__amd64)
4625084Sjohnlev static void
xpv_panic_hypercall(ulong_t call)4635084Sjohnlev xpv_panic_hypercall(ulong_t call)
4645084Sjohnlev {
4655084Sjohnlev panic("Illegally issued hypercall %d during panic!\n", (int)call);
4665084Sjohnlev }
4675084Sjohnlev #endif
4685084Sjohnlev
4695084Sjohnlev void
xpv_die(struct regs * rp)4705084Sjohnlev xpv_die(struct regs *rp)
4715084Sjohnlev {
4725084Sjohnlev struct panic_trap_info ti;
4735084Sjohnlev struct cregs creg;
4745084Sjohnlev
4755084Sjohnlev ti.trap_regs = rp;
4765084Sjohnlev ti.trap_type = rp->r_trapno;
4775084Sjohnlev
4785084Sjohnlev curthread->t_panic_trap = &ti;
4795084Sjohnlev if (ti.trap_type == T_PGFLT) {
4805084Sjohnlev getcregs(&creg);
4815084Sjohnlev ti.trap_addr = (caddr_t)creg.cr_cr2;
4825084Sjohnlev panic("Fatal pagefault at 0x%lx. fault addr=0x%p rp=0x%p",
4837240Srh87107 rp->r_pc, (void *)ti.trap_addr, (void *)rp);
4845084Sjohnlev } else {
4855084Sjohnlev ti.trap_addr = (caddr_t)rp->r_pc;
4865084Sjohnlev panic("Fatal trap %ld at 0x%lx. rp=0x%p", rp->r_trapno,
4877240Srh87107 rp->r_pc, (void *)rp);
4885084Sjohnlev }
4895084Sjohnlev }
4905084Sjohnlev
4915084Sjohnlev /*
4925084Sjohnlev * Build IDT to handle a Xen panic
4935084Sjohnlev */
4945084Sjohnlev static void
switch_to_xpv_panic_idt()4955084Sjohnlev switch_to_xpv_panic_idt()
4965084Sjohnlev {
4975084Sjohnlev int i;
4985084Sjohnlev desctbr_t idtr;
4995084Sjohnlev gate_desc_t *idt = xpv_panic_idt;
5005084Sjohnlev selector_t cs = get_cs_register();
5015084Sjohnlev
5025084Sjohnlev for (i = 0; i < 32; i++)
5038679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL,
5048679SSeth.Goldberg@Sun.COM 0);
5055084Sjohnlev
5068679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL,
5078679SSeth.Goldberg@Sun.COM 0);
5088679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
5098679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0);
5105084Sjohnlev set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT,
5118679SSeth.Goldberg@Sun.COM TRP_XPL, 0);
5128679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL,
5138679SSeth.Goldberg@Sun.COM 0);
5148679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL,
5158679SSeth.Goldberg@Sun.COM 0);
5168679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL,
5178679SSeth.Goldberg@Sun.COM 0);
5188679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL,
5198679SSeth.Goldberg@Sun.COM 0);
5208679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0);
5218679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0);
5228679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0);
5238679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL,
5248679SSeth.Goldberg@Sun.COM 0);
5258679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL,
5268679SSeth.Goldberg@Sun.COM 0);
5278679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0);
5288679SSeth.Goldberg@Sun.COM set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
5295084Sjohnlev
5305084Sjohnlev /*
5315084Sjohnlev * We have no double fault handler. Any single fault represents a
5325084Sjohnlev * catastrophic failure for us, so there is no attempt to handle
5335084Sjohnlev * them cleanly: we just print a message and reboot. If we
5345084Sjohnlev * encounter a second fault while doing that, there is nothing
5355084Sjohnlev * else we can do.
5365084Sjohnlev */
5375084Sjohnlev
5385084Sjohnlev /*
5395084Sjohnlev * Be prepared to absorb any stray device interrupts received
5405084Sjohnlev * while writing the core to disk.
5415084Sjohnlev */
5425084Sjohnlev for (i = 33; i < NIDT; i++)
5435084Sjohnlev set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT,
5448679SSeth.Goldberg@Sun.COM TRP_XPL, 0);
5455084Sjohnlev
5465084Sjohnlev /* The one interrupt we expect to get is from the APIC timer. */
5475084Sjohnlev set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT,
5488679SSeth.Goldberg@Sun.COM TRP_XPL, 0);
5495084Sjohnlev
5505084Sjohnlev idtr.dtr_base = (uintptr_t)xpv_panic_idt;
5515084Sjohnlev idtr.dtr_limit = sizeof (xpv_panic_idt) - 1;
5525084Sjohnlev wr_idtr(&idtr);
5535084Sjohnlev
5545084Sjohnlev #if defined(__amd64)
5555084Sjohnlev /* Catch any hypercalls. */
5565084Sjohnlev wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall);
5575084Sjohnlev wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall);
5585084Sjohnlev #endif
5595084Sjohnlev }
5605084Sjohnlev
5615084Sjohnlev static void
xpv_apic_clkinit()5625084Sjohnlev xpv_apic_clkinit()
5635084Sjohnlev {
5645084Sjohnlev uint_t apic_ticks = 0;
5655084Sjohnlev
5665084Sjohnlev /*
5675084Sjohnlev * Measure how many APIC ticks there are within a fixed time
5685084Sjohnlev * period. We're going to be fairly coarse here. This timer is
5695084Sjohnlev * just being used to detect a stalled panic, so as long as we have
5705084Sjohnlev * the right order of magnitude, everything should be fine.
5715084Sjohnlev */
5725084Sjohnlev xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR;
5735084Sjohnlev xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK;
5745084Sjohnlev xpv_apicadr[APIC_INT_VECT0] = AV_MASK; /* local intr reg 0 */
5755084Sjohnlev
5765084Sjohnlev xpv_apicadr[APIC_DIVIDE_REG] = 0;
5775084Sjohnlev xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL;
5785084Sjohnlev drv_usecwait(XPV_TIMER_INTERVAL);
5795084Sjohnlev apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT];
5805084Sjohnlev
5815084Sjohnlev /*
5825084Sjohnlev * apic_ticks now represents roughly how many apic ticks comprise
5835084Sjohnlev * one timeout interval. Program the timer to send us an interrupt
5845084Sjohnlev * every time that interval expires.
5855084Sjohnlev */
586*13029SKrishnendu.Sadhukhan@Sun.COM xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_PERIODIC;
5875084Sjohnlev xpv_apicadr[APIC_INIT_COUNT] = apic_ticks;
5885084Sjohnlev xpv_apicadr[APIC_EOI_REG] = 0;
5895084Sjohnlev }
5905084Sjohnlev
5915084Sjohnlev void
xpv_timer_tick(void)5925084Sjohnlev xpv_timer_tick(void)
5935084Sjohnlev {
5945084Sjohnlev static int ticks = 0;
5955084Sjohnlev
5965084Sjohnlev if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) {
5975084Sjohnlev ticks = 0;
5985084Sjohnlev if (dump_timeleft && (--dump_timeleft == 0))
5995084Sjohnlev panic("Xen panic timeout\n");
6005084Sjohnlev }
6015084Sjohnlev xpv_apicadr[APIC_EOI_REG] = 0;
6025084Sjohnlev }
6035084Sjohnlev
6045084Sjohnlev void
xpv_interrupt(void)6055084Sjohnlev xpv_interrupt(void)
6065084Sjohnlev {
6075084Sjohnlev #ifdef DEBUG
6085084Sjohnlev static int cnt = 0;
6095084Sjohnlev
6105084Sjohnlev if (cnt++ < 10)
6115084Sjohnlev xpv_panic_printf("Unexpected interrupt received.\n");
6125084Sjohnlev if ((cnt < 1000) && ((cnt % 100) == 0))
6135084Sjohnlev xpv_panic_printf("%d unexpected interrupts received.\n", cnt);
6145084Sjohnlev #endif
6155084Sjohnlev
6165084Sjohnlev xpv_apicadr[APIC_EOI_REG] = 0;
6175084Sjohnlev }
6185084Sjohnlev
6195084Sjohnlev /*
6205084Sjohnlev * Managing time in panic context is trivial. We only have a single CPU,
6215084Sjohnlev * we never get rescheduled, we never get suspended. We just need to
6225084Sjohnlev * convert clock ticks into nanoseconds.
6235084Sjohnlev */
6245084Sjohnlev static hrtime_t
xpv_panic_gethrtime(void)6255084Sjohnlev xpv_panic_gethrtime(void)
6265084Sjohnlev {
6275084Sjohnlev hrtime_t tsc, hrt;
6285084Sjohnlev unsigned int *l = (unsigned int *)&(tsc);
6295084Sjohnlev
6305084Sjohnlev tsc = __rdtsc_insn();
6315084Sjohnlev hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) +
6325084Sjohnlev (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT));
6335084Sjohnlev
6345084Sjohnlev return (hrt);
6355084Sjohnlev }
6365084Sjohnlev
6375084Sjohnlev static void
xpv_panic_time_init()6385084Sjohnlev xpv_panic_time_init()
6395084Sjohnlev {
6405084Sjohnlev nsec_scale =
6415084Sjohnlev CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT;
6425084Sjohnlev
6435084Sjohnlev gethrtimef = xpv_panic_gethrtime;
6445084Sjohnlev }
6455084Sjohnlev
/*
 * Varargs front end for panicsys(): packages the trailing arguments into
 * a va_list and passes them on together with the saved register set.
 */
static void
xpv_panicsys(struct regs *rp, char *fmt, ...)
{
	extern void panicsys(const char *, va_list, struct regs *, int);
	va_list args;

	va_start(args, fmt);
	panicsys(fmt, args, rp, 1);
	va_end(args);
}
6565084Sjohnlev
6575084Sjohnlev void
xpv_do_panic(void * arg)6585084Sjohnlev xpv_do_panic(void *arg)
6595084Sjohnlev {
6605084Sjohnlev struct panic_info *pip = (struct panic_info *)arg;
6615084Sjohnlev int l;
6625084Sjohnlev struct cregs creg;
6635084Sjohnlev #if defined(__amd64)
6645084Sjohnlev extern uintptr_t postbootkernelbase;
6655084Sjohnlev #endif
6665084Sjohnlev
6675084Sjohnlev if (xpv_panicking++ > 0)
6685084Sjohnlev panic("multiple calls to xpv_do_panic()");
6695084Sjohnlev
6705084Sjohnlev /*
6715084Sjohnlev * Indicate to the underlying panic framework that a panic has been
6725084Sjohnlev * initiated. This is ordinarily done as part of vpanic(). Since
6735084Sjohnlev * we already have all the register state saved by the hypervisor,
6745084Sjohnlev * we skip that and jump straight into the panic processing code.
6757532SSean.Ye@Sun.COM *
6767532SSean.Ye@Sun.COM * XXX If another thread grabs and wins the panic_quiesce trigger
6777532SSean.Ye@Sun.COM * then we'll have two threads in panicsys believing they are in
6787532SSean.Ye@Sun.COM * charge of the panic attempt!
6795084Sjohnlev */
6805084Sjohnlev (void) panic_trigger(&panic_quiesce);
6815084Sjohnlev
6825084Sjohnlev #if defined(__amd64)
6835084Sjohnlev /*
6845084Sjohnlev * bzero() and bcopy() get unhappy when asked to operate on
6855084Sjohnlev * addresses outside of the kernel. At this point Xen is really a
6865084Sjohnlev * part of the kernel, so we update the routines' notion of where
6875084Sjohnlev * the kernel starts.
6885084Sjohnlev */
6895084Sjohnlev postbootkernelbase = xen_virt_start;
6905084Sjohnlev #endif
6915084Sjohnlev
6925084Sjohnlev #if defined(HYPERVISOR_VIRT_END)
6935084Sjohnlev xpv_end = HYPERVISOR_VIRT_END;
6945084Sjohnlev #else
6955084Sjohnlev xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t);
6965084Sjohnlev #endif
6975084Sjohnlev
6985084Sjohnlev /*
6995084Sjohnlev * If we were redirecting console output to the hypervisor, we have
7005084Sjohnlev * to stop.
7015084Sjohnlev */
7025084Sjohnlev use_polledio = B_FALSE;
7035084Sjohnlev if (console == CONS_HYPERVISOR) {
7045084Sjohnlev bcons_device_change(CONS_HYPERVISOR);
7055084Sjohnlev } else if (cons_polledio != NULL &&
7065084Sjohnlev cons_polledio->cons_polledio_putchar != NULL) {
7075084Sjohnlev if (cons_polledio->cons_polledio_enter != NULL)
7085084Sjohnlev cons_polledio->cons_polledio_enter(
7095084Sjohnlev cons_polledio->cons_polledio_argument);
7105084Sjohnlev use_polledio = 1;
7115084Sjohnlev }
7125084Sjohnlev
7135084Sjohnlev /* Make sure we handle all console output from here on. */
7145084Sjohnlev sysp->bsvc_putchar = xpv_panic_putc;
7155084Sjohnlev
7165084Sjohnlev /*
7175084Sjohnlev * If we find an unsupported panic_info structure, there's not much
7185084Sjohnlev * we can do other than complain, plow on, and hope for the best.
7195084Sjohnlev */
7205084Sjohnlev if (pip->pi_version != PANIC_INFO_VERSION)
7215084Sjohnlev xpv_panic_printf("Warning: Xen is using an unsupported "
7225084Sjohnlev "version of the panic_info structure.\n");
7235084Sjohnlev
7245084Sjohnlev xpv_panic_info = pip;
7255084Sjohnlev
7266144Srab #if defined(__amd64)
7276144Srab kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start;
7286144Srab if (xpv_panic_info->pi_xen_start == NULL) {
7296144Srab kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end;
7306144Srab } else {
7316144Srab kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start;
7326144Srab kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end;
7336144Srab kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end;
7346144Srab }
7356144Srab #endif
7366144Srab
7375084Sjohnlev /*
7385084Sjohnlev * Make sure we are running on the Solaris %gs. The Xen panic code
7395084Sjohnlev * should already have set up the GDT properly.
7405084Sjohnlev */
7415084Sjohnlev xpv_panic_resetgs();
7425084Sjohnlev #if defined(__amd64)
7435084Sjohnlev wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
7445084Sjohnlev #endif
7455084Sjohnlev
7465084Sjohnlev xpv_panic_time_init();
7475084Sjohnlev
7485084Sjohnlev /*
7495084Sjohnlev * Switch to our own IDT, avoiding any accidental returns to Xen
7505084Sjohnlev * world.
7515084Sjohnlev */
7525084Sjohnlev switch_to_xpv_panic_idt();
7535084Sjohnlev
7545084Sjohnlev /*
7555084Sjohnlev * Initialize the APIC timer, which is used to detect a hung dump
7565084Sjohnlev * attempt.
7575084Sjohnlev */
7585084Sjohnlev xpv_apicadr = pip->pi_apic;
7595084Sjohnlev xpv_apic_clkinit();
7605084Sjohnlev
7615084Sjohnlev /*
7625084Sjohnlev * Set up a few values that we'll need repeatedly.
7635084Sjohnlev */
7645084Sjohnlev getcregs(&creg);
7655084Sjohnlev xpv_panic_cr3 = creg.cr_cr3;
7665084Sjohnlev for (l = mmu.max_level; l >= 0; l--)
7675084Sjohnlev xpv_panic_nptes[l] = mmu.ptes_per_table;
7685084Sjohnlev #ifdef __i386
7695084Sjohnlev if (mmu.pae_hat)
7705084Sjohnlev xpv_panic_nptes[mmu.max_level] = 4;
7715084Sjohnlev #endif
7725084Sjohnlev
7735084Sjohnlev /* Add the fake Xen module to the module list */
7745084Sjohnlev if (xpv_module != NULL) {
7755084Sjohnlev extern int last_module_id;
7765084Sjohnlev
7775084Sjohnlev xpv_modctl->mod_id = last_module_id++;
7785084Sjohnlev xpv_modctl->mod_next = &modules;
7795084Sjohnlev xpv_modctl->mod_prev = modules.mod_prev;
7805084Sjohnlev modules.mod_prev->mod_next = xpv_modctl;
7815084Sjohnlev modules.mod_prev = xpv_modctl;
7825084Sjohnlev }
7837532SSean.Ye@Sun.COM
7847532SSean.Ye@Sun.COM if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC)
7857532SSean.Ye@Sun.COM xpv_mca_panic_data = &pip->pi_mca;
7867532SSean.Ye@Sun.COM
7875084Sjohnlev xpv_panic_printf = printf;
7885084Sjohnlev xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr);
7895084Sjohnlev xpv_panic_printf("Failed to reboot following panic.\n");
7905084Sjohnlev for (;;)
7915084Sjohnlev ;
7925084Sjohnlev }
7935084Sjohnlev
7945084Sjohnlev /*
7955084Sjohnlev * Set up the necessary data structures to pretend that the Xen hypervisor
7965084Sjohnlev * is a loadable module, allowing mdb to find the Xen symbols in a crash
7975084Sjohnlev * dump. Since these symbols all map to VA space Solaris doesn't normally
7985084Sjohnlev * have access to, we don't link these structures into the kernel's lists
7995084Sjohnlev * until/unless we hit a Xen panic.
8005084Sjohnlev *
8015084Sjohnlev * The observant reader will note a striking amount of overlap between this
8025084Sjohnlev * code and that found in krtld. While it would be handy if we could just
8035084Sjohnlev * ask krtld to do this work for us, it's not that simple. Among the
8045084Sjohnlev * complications: we're not actually loading the text here (grub did it at
8055084Sjohnlev * boot), the .text section is writable, there are no relocations to do,
8065084Sjohnlev * none of the module text/data is in readable memory, etc. Training krtld
8075084Sjohnlev * to deal with this weird module is as complicated, and more risky, than
8085084Sjohnlev * reimplementing the necessary subset of it here.
8095084Sjohnlev */
8105084Sjohnlev static void
init_xen_module()8115084Sjohnlev init_xen_module()
8125084Sjohnlev {
8135084Sjohnlev struct _buf *file = NULL;
8145084Sjohnlev struct module *mp;
8155084Sjohnlev struct modctl *mcp;
8165084Sjohnlev int i, shn;
8175084Sjohnlev Shdr *shp, *ctf_shp;
8185084Sjohnlev char *names = NULL;
8195084Sjohnlev size_t n, namesize, text_align, data_align;
8205084Sjohnlev #if defined(__amd64)
8215084Sjohnlev const char machine = EM_AMD64;
8225084Sjohnlev #else
8235084Sjohnlev const char machine = EM_386;
8245084Sjohnlev #endif
8255084Sjohnlev
8265084Sjohnlev /* Allocate and init the module structure */
8275084Sjohnlev mp = kmem_zalloc(sizeof (*mp), KM_SLEEP);
8285084Sjohnlev mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
8295084Sjohnlev (void) strcpy(mp->filename, XPV_FILENAME);
8305084Sjohnlev
8315084Sjohnlev /* Allocate and init the modctl structure */
8325084Sjohnlev mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP);
8335084Sjohnlev mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP);
8345084Sjohnlev (void) strcpy(mcp->mod_modname, XPV_MODNAME);
8355084Sjohnlev mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
8365084Sjohnlev (void) strcpy(mcp->mod_filename, XPV_FILENAME);
8375084Sjohnlev mcp->mod_inprogress_thread = (kthread_id_t)-1;
8385084Sjohnlev mcp->mod_ref = 1;
8395084Sjohnlev mcp->mod_loaded = 1;
8405084Sjohnlev mcp->mod_loadcnt = 1;
8415084Sjohnlev mcp->mod_mp = mp;
8425084Sjohnlev
8435084Sjohnlev /*
8445084Sjohnlev * Try to open a Xen image that hasn't had its symbol and CTF
8455084Sjohnlev * information stripped off.
8465084Sjohnlev */
8475084Sjohnlev file = kobj_open_file(XPV_FILENAME);
8485084Sjohnlev if (file == (struct _buf *)-1) {
8495084Sjohnlev file = NULL;
8505084Sjohnlev goto err;
8515084Sjohnlev }
8525084Sjohnlev
8535084Sjohnlev /*
8545084Sjohnlev * Read the header and ensure that this is an ELF file for the
8555084Sjohnlev * proper ISA. If it's not, somebody has done something very
8565084Sjohnlev * stupid. Why bother? See Mencken.
8575084Sjohnlev */
8585084Sjohnlev if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0)
8595084Sjohnlev goto err;
8605084Sjohnlev for (i = 0; i < SELFMAG; i++)
8615084Sjohnlev if (mp->hdr.e_ident[i] != ELFMAG[i])
8625084Sjohnlev goto err;
8635084Sjohnlev if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) ||
8645084Sjohnlev (mp->hdr.e_machine != machine))
8655084Sjohnlev goto err;
8665084Sjohnlev
8675084Sjohnlev /* Read in the section headers */
8685084Sjohnlev n = mp->hdr.e_shentsize * mp->hdr.e_shnum;
8695084Sjohnlev mp->shdrs = kmem_zalloc(n, KM_SLEEP);
8705084Sjohnlev if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0)
8715084Sjohnlev goto err;
8725084Sjohnlev
8735084Sjohnlev /* Read the section names */
8745084Sjohnlev shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize);
8755084Sjohnlev namesize = shp->sh_size;
8765084Sjohnlev names = kmem_zalloc(shp->sh_size, KM_SLEEP);
8775084Sjohnlev if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0)
8785084Sjohnlev goto err;
8795084Sjohnlev
8805084Sjohnlev /*
8815084Sjohnlev * Fill in the text and data size fields.
8825084Sjohnlev */
8835084Sjohnlev ctf_shp = NULL;
8845084Sjohnlev text_align = data_align = 0;
8855084Sjohnlev for (shn = 1; shn < mp->hdr.e_shnum; shn++) {
8865084Sjohnlev shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize);
8875084Sjohnlev
8885084Sjohnlev /* Sanity check the offset of the section name */
8895084Sjohnlev if (shp->sh_name >= namesize)
8905084Sjohnlev continue;
8915084Sjohnlev
8925084Sjohnlev /* If we find the symtab section, remember it for later. */
8935084Sjohnlev if (shp->sh_type == SHT_SYMTAB) {
8945084Sjohnlev mp->symtbl_section = shn;
8955084Sjohnlev mp->symhdr = shp;
8965084Sjohnlev continue;
8975084Sjohnlev }
8985084Sjohnlev
8995084Sjohnlev /* If we find the CTF section, remember it for later. */
9005084Sjohnlev if ((shp->sh_size != 0) &&
9015084Sjohnlev (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) {
9025084Sjohnlev ctf_shp = shp;
9035084Sjohnlev continue;
9045084Sjohnlev }
9055084Sjohnlev
9065084Sjohnlev if (!(shp->sh_flags & SHF_ALLOC))
9075084Sjohnlev continue;
9085084Sjohnlev
9095084Sjohnlev /*
9105084Sjohnlev * Xen marks its text section as writable, so we need to
9115084Sjohnlev * look for the name - not just the flag.
9125084Sjohnlev */
9135084Sjohnlev if ((strcmp(&names[shp->sh_name], ".text") != NULL) &&
9145084Sjohnlev (shp->sh_flags & SHF_WRITE) != 0) {
9155084Sjohnlev if (shp->sh_addralign > data_align)
9165084Sjohnlev data_align = shp->sh_addralign;
9175084Sjohnlev mp->data_size = ALIGN(mp->data_size, data_align);
9185084Sjohnlev mp->data_size += ALIGN(shp->sh_size, 8);
9195084Sjohnlev if (mp->data == NULL || mp->data > (char *)shp->sh_addr)
9205084Sjohnlev mp->data = (char *)shp->sh_addr;
9215084Sjohnlev } else {
9225084Sjohnlev if (shp->sh_addralign > text_align)
9235084Sjohnlev text_align = shp->sh_addralign;
9245084Sjohnlev mp->text_size = ALIGN(mp->text_size, text_align);
9255084Sjohnlev mp->text_size += ALIGN(shp->sh_size, 8);
9265084Sjohnlev if (mp->text == NULL || mp->text > (char *)shp->sh_addr)
9275084Sjohnlev mp->text = (char *)shp->sh_addr;
9285084Sjohnlev }
9295084Sjohnlev }
9305084Sjohnlev kmem_free(names, namesize);
9315084Sjohnlev names = NULL;
9325249Snn35248 shp = NULL;
9335084Sjohnlev mcp->mod_text = mp->text;
9345084Sjohnlev mcp->mod_text_size = mp->text_size;
9355084Sjohnlev
9365084Sjohnlev /*
9375084Sjohnlev * If we have symbol table and string table sections, read them in
9385084Sjohnlev * now. If we don't, we just plow on. We'll still get a valid
9395084Sjohnlev * core dump, but finding anything useful will be just a bit
9405084Sjohnlev * harder.
9415084Sjohnlev *
9425084Sjohnlev * Note: we don't bother with a hash table. We'll never do a
9435084Sjohnlev * symbol lookup unless we crash, and then mdb creates its own. We
9445084Sjohnlev * also don't try to perform any relocations. Xen should be loaded
9455084Sjohnlev * exactly where the ELF file indicates, and the symbol information
9465084Sjohnlev * in the file should be complete and correct already. Static
9475084Sjohnlev * linking ain't all bad.
9485084Sjohnlev */
9495084Sjohnlev if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) {
9505084Sjohnlev mp->strhdr = (Shdr *)
9515084Sjohnlev (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize);
9525084Sjohnlev mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize;
9535084Sjohnlev
9545084Sjohnlev /* Allocate space for the symbol table and strings. */
9555084Sjohnlev mp->symsize = mp->symhdr->sh_size +
9565084Sjohnlev mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size;
9575084Sjohnlev mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP);
9585084Sjohnlev mp->symtbl = mp->symspace;
9595084Sjohnlev mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size);
9605084Sjohnlev
9615084Sjohnlev if ((kobj_read_file(file, mp->symtbl,
9625084Sjohnlev mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) ||
9635084Sjohnlev (kobj_read_file(file, mp->strings,
9645084Sjohnlev mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0))
9655084Sjohnlev goto err;
9665084Sjohnlev }
9675084Sjohnlev
9685084Sjohnlev /*
9695084Sjohnlev * Read in the CTF section
9705084Sjohnlev */
9715084Sjohnlev if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) {
9725249Snn35248 mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP);
9735084Sjohnlev mp->ctfsize = ctf_shp->sh_size;
9745084Sjohnlev if (kobj_read_file(file, mp->ctfdata, mp->ctfsize,
9755084Sjohnlev ctf_shp->sh_offset) < 0)
9765084Sjohnlev goto err;
9775084Sjohnlev }
9785084Sjohnlev
9795084Sjohnlev kobj_close_file(file);
9805084Sjohnlev
9815084Sjohnlev xpv_module = mp;
9825084Sjohnlev xpv_modctl = mcp;
9835084Sjohnlev return;
9845084Sjohnlev
9855084Sjohnlev err:
9865084Sjohnlev cmn_err(CE_WARN, "Failed to initialize xpv module.");
9875084Sjohnlev if (file != NULL)
9885084Sjohnlev kobj_close_file(file);
9895084Sjohnlev
9905084Sjohnlev kmem_free(mp->filename, strlen(XPV_FILENAME) + 1);
9915084Sjohnlev if (mp->shdrs != NULL)
9925084Sjohnlev kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum);
9935084Sjohnlev if (mp->symspace != NULL)
9945084Sjohnlev kmem_free(mp->symspace, mp->symsize);
9955084Sjohnlev if (mp->ctfdata != NULL)
9965084Sjohnlev kmem_free(mp->ctfdata, mp->ctfsize);
9975084Sjohnlev kmem_free(mp, sizeof (*mp));
9985084Sjohnlev kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1);
9995084Sjohnlev kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1);
10005084Sjohnlev kmem_free(mcp, sizeof (*mcp));
10015084Sjohnlev if (names != NULL)
10025084Sjohnlev kmem_free(names, namesize);
10035084Sjohnlev }
10045084Sjohnlev
10055084Sjohnlev void
xpv_panic_init()10065084Sjohnlev xpv_panic_init()
10075084Sjohnlev {
10085084Sjohnlev xen_platform_op_t op;
10095084Sjohnlev int i;
10105084Sjohnlev
10115084Sjohnlev ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
10125084Sjohnlev
10135084Sjohnlev for (i = 0; i < mmu.num_level; i++)
10145084Sjohnlev ptable_pfn[i] = PFN_INVALID;
10155084Sjohnlev
10165084Sjohnlev /* Let Xen know where to jump if/when it panics. */
10175084Sjohnlev op.cmd = XENPF_panic_init;
10185084Sjohnlev op.interface_version = XENPF_INTERFACE_VERSION;
10195084Sjohnlev op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr;
10205084Sjohnlev
10215084Sjohnlev (void) HYPERVISOR_platform_op(&op);
10225084Sjohnlev
10235084Sjohnlev init_xen_module();
10245084Sjohnlev }
1025