xref: /onnv-gate/usr/src/uts/i86xpv/os/xpv_panic.c (revision 7240:c4957ab6a78e)
15084Sjohnlev /*
25084Sjohnlev  * CDDL HEADER START
35084Sjohnlev  *
45084Sjohnlev  * The contents of this file are subject to the terms of the
55084Sjohnlev  * Common Development and Distribution License (the "License").
65084Sjohnlev  * You may not use this file except in compliance with the License.
75084Sjohnlev  *
85084Sjohnlev  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
95084Sjohnlev  * or http://www.opensolaris.org/os/licensing.
105084Sjohnlev  * See the License for the specific language governing permissions
115084Sjohnlev  * and limitations under the License.
125084Sjohnlev  *
135084Sjohnlev  * When distributing Covered Code, include this CDDL HEADER in each
145084Sjohnlev  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
155084Sjohnlev  * If applicable, add the following below this CDDL HEADER, with the
165084Sjohnlev  * fields enclosed by brackets "[]" replaced with your own identifying
175084Sjohnlev  * information: Portions Copyright [yyyy] [name of copyright owner]
185084Sjohnlev  *
195084Sjohnlev  * CDDL HEADER END
205084Sjohnlev  */
215084Sjohnlev 
225084Sjohnlev /*
236144Srab  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
245084Sjohnlev  * Use is subject to license terms.
255084Sjohnlev  */
265084Sjohnlev 
275084Sjohnlev #pragma ident	"%Z%%M%	%I%	%E% SMI"
285084Sjohnlev 
295084Sjohnlev #include <sys/types.h>
305084Sjohnlev #include <sys/clock.h>
315084Sjohnlev #include <sys/psm.h>
325084Sjohnlev #include <sys/archsystm.h>
335084Sjohnlev #include <sys/machsystm.h>
345084Sjohnlev #include <sys/compress.h>
355084Sjohnlev #include <sys/modctl.h>
365084Sjohnlev #include <sys/trap.h>
375084Sjohnlev #include <sys/panic.h>
385084Sjohnlev #include <sys/regset.h>
395084Sjohnlev #include <sys/frame.h>
405084Sjohnlev #include <sys/kobj.h>
415084Sjohnlev #include <sys/apic.h>
425084Sjohnlev #include <sys/dumphdr.h>
435084Sjohnlev #include <sys/mem.h>
445084Sjohnlev #include <sys/x86_archext.h>
455084Sjohnlev #include <sys/xpv_panic.h>
465084Sjohnlev #include <sys/boot_console.h>
475084Sjohnlev #include <sys/bootsvcs.h>
485084Sjohnlev #include <sys/consdev.h>
495084Sjohnlev #include <vm/hat_pte.h>
505084Sjohnlev #include <vm/hat_i86.h>
515084Sjohnlev 
/*
 * Path of the Xen symbol file for the running ISA.
 * XXX: a PAE version is needed too, if we ever support both PAE and non-PAE.
 */
#if !defined(__i386)
#define	XPV_FILENAME	"/boot/amd64/xen-syms"
#else
#define	XPV_FILENAME	"/boot/xen-syms"
#endif
#define	XPV_MODNAME	"xpv"

/* Incremented on entry to xpv_do_panic(); non-zero means a Xen panic. */
int xpv_panicking = 0;

/*
 * The fake module describing Xen.  xpv_do_panic() links xpv_modctl onto
 * the kernel's module list when xpv_module is non-NULL.
 */
struct modctl *xpv_modctl;
struct module *xpv_module;
645084Sjohnlev 
/*
 * Round x up to a multiple of a.  The mask arithmetic is only correct when
 * a is a power of two.  An alignment of 0 means "no alignment requested"
 * and yields x unchanged.  Note that both arguments are evaluated more
 * than once.
 */
#define	ALIGN(x, a)	((a) == 0 ? (uintptr_t)(x) : \
	(((uintptr_t)(x) + (uintptr_t)(a) - 1UL) & ~((uintptr_t)(a) - 1UL)))
675084Sjohnlev 
685084Sjohnlev /* Pointer to the xpv_panic_info structure handed to us by Xen.  */
695084Sjohnlev static struct panic_info *xpv_panic_info = NULL;
705084Sjohnlev 
715084Sjohnlev /* Timer support */
725084Sjohnlev #define	NSEC_SHIFT 5
735084Sjohnlev #define	T_XPV_TIMER	0xd1
745084Sjohnlev #define	XPV_TIMER_INTERVAL	1000	/* 1000 microseconds */
755084Sjohnlev static uint32_t *xpv_apicadr = NULL;
765084Sjohnlev static uint_t	nsec_scale;
775084Sjohnlev 
785084Sjohnlev /* IDT support */
795084Sjohnlev #pragma	align	16(xpv_panic_idt)
805084Sjohnlev static gate_desc_t	xpv_panic_idt[NIDT];	/* interrupt descriptor table */
815084Sjohnlev 
825084Sjohnlev /* Xen pagetables mapped into our HAT's ptable windows */
835084Sjohnlev static pfn_t ptable_pfn[MAX_NUM_LEVEL];
845084Sjohnlev 
855084Sjohnlev /* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
865084Sjohnlev static int xpv_dump_pages;
875084Sjohnlev 
885084Sjohnlev /*
896144Srab  * There are up to two large swathes of RAM that we don't want to include
906144Srab  * in the dump: those that comprise the Xen version of segkpm.  On 32-bit
916144Srab  * systems there is no such region of memory.  On 64-bit systems, there
926144Srab  * should be just a single contiguous region that corresponds to all of
936144Srab  * physical memory.  The tricky bit is that Xen's heap sometimes lives in
946144Srab  * the middle of their segkpm, and is mapped using only kpm-like addresses.
956144Srab  * In that case, we need to skip the swathes before and after Xen's heap.
966144Srab  */
976144Srab uintptr_t kpm1_low = 0;
986144Srab uintptr_t kpm1_high = 0;
996144Srab uintptr_t kpm2_low = 0;
1006144Srab uintptr_t kpm2_high = 0;
1016144Srab 
1026144Srab /*
1035084Sjohnlev  * Some commonly used values that we don't want to recompute over and over.
1045084Sjohnlev  */
1055084Sjohnlev static int xpv_panic_nptes[MAX_NUM_LEVEL];
1065084Sjohnlev static ulong_t xpv_panic_cr3;
1075084Sjohnlev static uintptr_t xpv_end;
1085084Sjohnlev 
1095084Sjohnlev static void xpv_panic_console_print(const char *fmt, ...);
1105084Sjohnlev static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;
1115084Sjohnlev 
1125084Sjohnlev #define	CONSOLE_BUF_SIZE	256
1135084Sjohnlev static char console_buffer[CONSOLE_BUF_SIZE];
1145084Sjohnlev static boolean_t use_polledio;
1155084Sjohnlev 
1165084Sjohnlev static void
1175084Sjohnlev xpv_panic_putc(int m)
1185084Sjohnlev {
1195084Sjohnlev 	struct cons_polledio *c = cons_polledio;
1205084Sjohnlev 
1215084Sjohnlev 	/* This really shouldn't happen */
1225084Sjohnlev 	if (console == CONS_HYPERVISOR)
1235084Sjohnlev 		return;
1245084Sjohnlev 
1255084Sjohnlev 	if (use_polledio == B_TRUE)
1265084Sjohnlev 		c->cons_polledio_putchar(c->cons_polledio_argument, m);
1275084Sjohnlev 	else
1285084Sjohnlev 		bcons_putchar(m);
1295084Sjohnlev }
1305084Sjohnlev 
1315084Sjohnlev static void
1325084Sjohnlev xpv_panic_puts(char *msg)
1335084Sjohnlev {
1345084Sjohnlev 	char *m;
1355084Sjohnlev 
1365084Sjohnlev 	dump_timeleft = dump_timeout;
1375084Sjohnlev 	for (m = msg; *m; m++)
1385084Sjohnlev 		xpv_panic_putc((int)*m);
1395084Sjohnlev }
1405084Sjohnlev 
1415084Sjohnlev static void
1425084Sjohnlev xpv_panic_console_print(const char *fmt, ...)
1435084Sjohnlev {
1445084Sjohnlev 	va_list ap;
1455084Sjohnlev 
1465084Sjohnlev 	va_start(ap, fmt);
1475084Sjohnlev 	(void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap);
1485084Sjohnlev 	va_end(ap);
1495084Sjohnlev 
1505084Sjohnlev 	xpv_panic_puts(console_buffer);
1515084Sjohnlev }
1525084Sjohnlev 
1535084Sjohnlev static void
1545084Sjohnlev xpv_panic_map(int level, pfn_t pfn)
1555084Sjohnlev {
1565084Sjohnlev 	x86pte_t pte, *pteptr;
1575084Sjohnlev 
1585084Sjohnlev 	/*
1595084Sjohnlev 	 * The provided pfn represents a level 'level' page table.  Map it
1605084Sjohnlev 	 * into the 'level' slot in the list of page table windows.
1615084Sjohnlev 	 */
1625084Sjohnlev 	pteptr = (x86pte_t *)PWIN_PTE_VA(level);
1635084Sjohnlev 	pte = pfn_to_pa(pfn) | PT_VALID;
1645084Sjohnlev 
1655084Sjohnlev 	XPV_ALLOW_PAGETABLE_UPDATES();
1665084Sjohnlev 	if (mmu.pae_hat)
1675084Sjohnlev 		*pteptr = pte;
1685084Sjohnlev 	else
1695084Sjohnlev 		*(x86pte32_t *)pteptr = pte;
1705084Sjohnlev 	XPV_DISALLOW_PAGETABLE_UPDATES();
1715084Sjohnlev 
1725084Sjohnlev 	mmu_tlbflush_entry(PWIN_VA(level));
1735084Sjohnlev }
1745084Sjohnlev 
1755084Sjohnlev /*
1765084Sjohnlev  * Walk the page tables to find the pfn mapped by the given va.
1775084Sjohnlev  */
1785084Sjohnlev static pfn_t
1795084Sjohnlev xpv_va_walk(uintptr_t *vaddr)
1805084Sjohnlev {
1815084Sjohnlev 	int l, idx;
1825084Sjohnlev 	pfn_t pfn;
1835084Sjohnlev 	x86pte_t pte;
1845084Sjohnlev 	x86pte_t *ptep;
1855084Sjohnlev 	uintptr_t va = *vaddr;
1865084Sjohnlev 	uintptr_t scan_va;
1875084Sjohnlev 	caddr_t ptable_window;
1885084Sjohnlev 	static pfn_t toplevel_pfn;
1895084Sjohnlev 	static uintptr_t lastva;
1905084Sjohnlev 
1915084Sjohnlev 	/*
1925084Sjohnlev 	 * If we do anything other than a simple scan through memory, don't
1935084Sjohnlev 	 * trust the mapped page tables.
1945084Sjohnlev 	 */
1955084Sjohnlev 	if (va != lastva + MMU_PAGESIZE)
1965084Sjohnlev 		for (l = mmu.max_level; l >= 0; l--)
1975084Sjohnlev 			ptable_pfn[l] = PFN_INVALID;
1985084Sjohnlev 
1995084Sjohnlev 	toplevel_pfn = mmu_btop(xpv_panic_cr3);
2005084Sjohnlev 
2015084Sjohnlev 	while (va < xpv_end && va >= *vaddr) {
2025084Sjohnlev 		/* Find the lowest table with any entry for va */
2035084Sjohnlev 		pfn = toplevel_pfn;
2045084Sjohnlev 		for (l = mmu.max_level; l >= 0; l--) {
2055084Sjohnlev 			if (ptable_pfn[l] != pfn) {
2065084Sjohnlev 				xpv_panic_map(l, pfn);
2075084Sjohnlev 				ptable_pfn[l] = pfn;
2085084Sjohnlev 			}
2095084Sjohnlev 
2105084Sjohnlev 			/*
2115084Sjohnlev 			 * Search this pagetable for any mapping to an
2125084Sjohnlev 			 * address >= va.
2135084Sjohnlev 			 */
2145084Sjohnlev 			ptable_window = PWIN_VA(l);
2155084Sjohnlev 			if (l == mmu.max_level && mmu.pae_hat)
2165084Sjohnlev 				ptable_window +=
2175084Sjohnlev 				    (xpv_panic_cr3 & MMU_PAGEOFFSET);
2185084Sjohnlev 
2195084Sjohnlev 			idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1);
2205084Sjohnlev 			scan_va = va;
2215084Sjohnlev 			while (idx < xpv_panic_nptes[l] && scan_va < xpv_end &&
2225084Sjohnlev 			    scan_va >= *vaddr) {
2235084Sjohnlev 				ptep = (x86pte_t *)(ptable_window +
2245084Sjohnlev 				    (idx << mmu.pte_size_shift));
2255084Sjohnlev 				pte = GET_PTE(ptep);
2265084Sjohnlev 				if (pte & PTE_VALID)
2275084Sjohnlev 					break;
2285084Sjohnlev 				idx++;
2295084Sjohnlev 				scan_va += mmu.level_size[l];
2305084Sjohnlev 			}
2315084Sjohnlev 
2325084Sjohnlev 			/*
2335084Sjohnlev 			 * If there are no valid mappings in this table, we
2345084Sjohnlev 			 * can skip to the end of the VA range it covers.
2355084Sjohnlev 			 */
2365084Sjohnlev 			if (idx == xpv_panic_nptes[l]) {
2375084Sjohnlev 				va = NEXT_ENTRY_VA(va, l + 1);
2385084Sjohnlev 				break;
2395084Sjohnlev 			}
2405084Sjohnlev 
2416144Srab 			va = scan_va;
2426144Srab 			/*
2436144Srab 			 * See if we've hit the end of the range.
2446144Srab 			 */
2456144Srab 			if (va >= xpv_end || va < *vaddr)
2466144Srab 				break;
2476144Srab 
2485084Sjohnlev 			/*
2495084Sjohnlev 			 * If this mapping is for a pagetable, we drop down
2505084Sjohnlev 			 * to the next level in the hierarchy and look for
2515084Sjohnlev 			 * a mapping in it.
2525084Sjohnlev 			 */
2535084Sjohnlev 			pfn = PTE2MFN(pte, l);
2545084Sjohnlev 			if (!PTE_ISPAGE(pte, l))
2555084Sjohnlev 				continue;
2565084Sjohnlev 
2575084Sjohnlev 			/*
2585084Sjohnlev 			 * The APIC page is magic.  Nothing to see here;
2595084Sjohnlev 			 * move along.
2605084Sjohnlev 			 */
2615084Sjohnlev 			if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) ==
2625084Sjohnlev 			    (va & MMU_PAGEMASK)) {
2635084Sjohnlev 				va += MMU_PAGESIZE;
2645084Sjohnlev 				break;
2655084Sjohnlev 			}
2665084Sjohnlev 
2676144Srab 			/*
2686144Srab 			 * See if the address is within one of the two
2696144Srab 			 * kpm-like regions we want to skip.
2706144Srab 			 */
2716144Srab 			if (va >= kpm1_low && va < kpm1_high) {
2726144Srab 				va = kpm1_high;
2736144Srab 				break;
2746144Srab 			}
2756144Srab 			if (va >= kpm2_low && va < kpm2_high) {
2766144Srab 				va = kpm2_high;
2775084Sjohnlev 				break;
2785084Sjohnlev 			}
2795084Sjohnlev 
2805084Sjohnlev 			/*
2815084Sjohnlev 			 * The Xen panic code only handles small pages.  If
2825084Sjohnlev 			 * this mapping is for a large page, we need to
2835084Sjohnlev 			 * identify the consituent page that covers the
2845084Sjohnlev 			 * specific VA we were looking for.
2855084Sjohnlev 			 */
2865084Sjohnlev 			if (l > 0) {
2875084Sjohnlev 				if (l > 1)
2885084Sjohnlev 					panic("Xen panic can't cope with "
2895084Sjohnlev 					    "giant pages.");
2905084Sjohnlev 				idx = (va >> LEVEL_SHIFT(0)) &
2915084Sjohnlev 				    (xpv_panic_nptes[0] - 1);
2925084Sjohnlev 				pfn += idx;
2935084Sjohnlev 			}
2945084Sjohnlev 
2955084Sjohnlev 			*vaddr = va;
2965084Sjohnlev 			lastva = va;
2975084Sjohnlev 			return (pfn | PFN_IS_FOREIGN_MFN);
2985084Sjohnlev 		}
2995084Sjohnlev 	}
3005084Sjohnlev 	return (PFN_INVALID);
3015084Sjohnlev }
3025084Sjohnlev 
3035084Sjohnlev /*
3045084Sjohnlev  * Walk through the Xen VA space, finding pages that are mapped in.
3055084Sjohnlev  *
3065084Sjohnlev  * These pages all have MFNs rather than PFNs, meaning they may be outside
3075084Sjohnlev  * the physical address space the kernel knows about, or they may collide
3085084Sjohnlev  * with PFNs the kernel is using.
3095084Sjohnlev  *
3105084Sjohnlev  * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs
3115084Sjohnlev  * to avoid collisions doesn't work.  The pages need to be written to disk
3125084Sjohnlev  * in PFN-order or savecore gets confused.  We can't allocate memory to
3135084Sjohnlev  * contruct a sorted pfn->VA reverse mapping, so we have to write the pages
3145084Sjohnlev  * to disk in VA order.
3155084Sjohnlev  *
3165084Sjohnlev  * To square this circle, we simply make up PFNs for each of Xen's pages.
3175084Sjohnlev  * We assign each mapped page a fake PFN in ascending order.  These fake
3185084Sjohnlev  * PFNs each have the FOREIGN bit set, ensuring that they fall outside the
3195084Sjohnlev  * range of Solaris PFNs written by the kernel.
3205084Sjohnlev  */
3215084Sjohnlev int
3225084Sjohnlev dump_xpv_addr()
3235084Sjohnlev {
3245084Sjohnlev 	uintptr_t va;
3255084Sjohnlev 	mem_vtop_t mem_vtop;
3265084Sjohnlev 
3275084Sjohnlev 	xpv_dump_pages = 0;
3285084Sjohnlev 	va = xen_virt_start;
3295084Sjohnlev 
3305084Sjohnlev 	while (xpv_va_walk(&va) != PFN_INVALID) {
3315084Sjohnlev 		mem_vtop.m_as = &kas;
3325084Sjohnlev 		mem_vtop.m_va = (void *)va;
3335084Sjohnlev 		mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
3345084Sjohnlev 
3355084Sjohnlev 		dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
3365084Sjohnlev 		xpv_dump_pages++;
3375084Sjohnlev 
3385084Sjohnlev 		va += MMU_PAGESIZE;
3395084Sjohnlev 	}
3405084Sjohnlev 
3415084Sjohnlev 	/*
3425084Sjohnlev 	 * Add the shared_info page.  This page actually ends up in the
3435084Sjohnlev 	 * dump twice: once for the Xen va and once for the Solaris va.
3445084Sjohnlev 	 * This isn't ideal, but we don't know the address Xen is using for
3455084Sjohnlev 	 * the page, so we can't share it.
3465084Sjohnlev 	 */
3475084Sjohnlev 	mem_vtop.m_as = &kas;
3485084Sjohnlev 	mem_vtop.m_va = HYPERVISOR_shared_info;
3495084Sjohnlev 	mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
3505084Sjohnlev 	dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
3515084Sjohnlev 	xpv_dump_pages++;
3525084Sjohnlev 
3535084Sjohnlev 	return (xpv_dump_pages);
3545084Sjohnlev }
3555084Sjohnlev 
3565084Sjohnlev void
3575084Sjohnlev dump_xpv_pfn()
3585084Sjohnlev {
3595084Sjohnlev 	pfn_t pfn;
3605084Sjohnlev 	int cnt;
3615084Sjohnlev 
3625084Sjohnlev 	for (cnt = 0; cnt < xpv_dump_pages; cnt++) {
3635084Sjohnlev 		pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN;
3645084Sjohnlev 		dumpvp_write(&pfn, sizeof (pfn));
3655084Sjohnlev 	}
3665084Sjohnlev }
3675084Sjohnlev 
3685084Sjohnlev int
3695084Sjohnlev dump_xpv_data(void *dump_cbuf)
3705084Sjohnlev {
3715084Sjohnlev 	uintptr_t va;
3725084Sjohnlev 	uint32_t csize;
3735084Sjohnlev 	int cnt = 0;
3745084Sjohnlev 
3755084Sjohnlev 	/*
3765084Sjohnlev 	 * XXX: we should probably run this data through a UE check.  The
3775084Sjohnlev 	 * catch is that the UE code relies on on_trap() and getpfnum()
3785084Sjohnlev 	 * working.
3795084Sjohnlev 	 */
3805084Sjohnlev 	va = xen_virt_start;
3815084Sjohnlev 
3825084Sjohnlev 	while (xpv_va_walk(&va) != PFN_INVALID) {
3835084Sjohnlev 		csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE);
3845084Sjohnlev 		dumpvp_write(&csize, sizeof (uint32_t));
3855084Sjohnlev 		dumpvp_write(dump_cbuf, csize);
3865084Sjohnlev 		if (dump_ioerr) {
3875084Sjohnlev 			dumphdr->dump_flags &= ~DF_COMPLETE;
3885084Sjohnlev 			return (cnt);
3895084Sjohnlev 		}
3905084Sjohnlev 		cnt++;
3915084Sjohnlev 		va += MMU_PAGESIZE;
3925084Sjohnlev 	}
3935084Sjohnlev 
3945084Sjohnlev 	/*
3955084Sjohnlev 	 * Finally, dump the shared_info page
3965084Sjohnlev 	 */
3975084Sjohnlev 	csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf,
3985084Sjohnlev 	    PAGESIZE);
3995084Sjohnlev 	dumpvp_write(&csize, sizeof (uint32_t));
4005084Sjohnlev 	dumpvp_write(dump_cbuf, csize);
4015084Sjohnlev 	if (dump_ioerr)
4025084Sjohnlev 		dumphdr->dump_flags &= ~DF_COMPLETE;
4035084Sjohnlev 	cnt++;
4045084Sjohnlev 
4055084Sjohnlev 	return (cnt);
4065084Sjohnlev }
4075084Sjohnlev 
4085084Sjohnlev static void *
4095084Sjohnlev showstack(void *fpreg, int xpv_only)
4105084Sjohnlev {
4115084Sjohnlev 	struct frame *fpp;
4125084Sjohnlev 	ulong_t off;
4135084Sjohnlev 	char *sym;
4145084Sjohnlev 	uintptr_t pc, fp, lastfp;
4155084Sjohnlev 	uintptr_t minaddr = min(KERNELBASE, xen_virt_start);
4165084Sjohnlev 
4175084Sjohnlev 	fp = (uintptr_t)fpreg;
4185084Sjohnlev 	if (fp < minaddr) {
4195084Sjohnlev 		xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg);
4205084Sjohnlev 		return (fpreg);
4215084Sjohnlev 	}
4225084Sjohnlev 
4235084Sjohnlev 	do {
4245084Sjohnlev 		fpp = (struct frame *)fp;
4255084Sjohnlev 		pc = fpp->fr_savpc;
4265084Sjohnlev 
4275084Sjohnlev 		if ((xpv_only != 0) &&
4285084Sjohnlev 		    (fp > xpv_end || fp < xen_virt_start))
4295084Sjohnlev 			break;
4305084Sjohnlev 		if ((sym = kobj_getsymname(pc, &off)) != NULL)
4315084Sjohnlev 			xpv_panic_printf("%08lx %s:%s+%lx\n", fp,
4325084Sjohnlev 			    mod_containing_pc((caddr_t)pc), sym, off);
4335084Sjohnlev 		else if ((pc >= xen_virt_start) && (pc <= xpv_end))
4345084Sjohnlev 			xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc);
4355084Sjohnlev 		else
4365084Sjohnlev 			xpv_panic_printf("%08lx %lx\n", fp, pc);
4375084Sjohnlev 
4385084Sjohnlev 		lastfp = fp;
4395084Sjohnlev 		fp = fpp->fr_savfp;
4405084Sjohnlev 
4415084Sjohnlev 		/*
4425084Sjohnlev 		 * Xen marks an exception frame by inverting the frame
4435084Sjohnlev 		 * pointer.
4445084Sjohnlev 		 */
4455084Sjohnlev 		if (fp < lastfp) {
4465084Sjohnlev 			if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff)
4475084Sjohnlev 				fp = ~fp;
4485084Sjohnlev 		}
4495084Sjohnlev 	} while (fp > lastfp);
4505084Sjohnlev 	return ((void *)fp);
4515084Sjohnlev }
4525084Sjohnlev 
/*
 * Print the Xen portion of the stack starting at fpreg, and return the
 * frame pointer at which the walk stopped.
 */
void *
xpv_traceback(void *fpreg)
{
	void *stopped_at = showstack(fpreg, 1);

	return (stopped_at);
}
4585084Sjohnlev 
4595084Sjohnlev #if defined(__amd64)
4605084Sjohnlev static void
4615084Sjohnlev xpv_panic_hypercall(ulong_t call)
4625084Sjohnlev {
4635084Sjohnlev 	panic("Illegally issued hypercall %d during panic!\n", (int)call);
4645084Sjohnlev }
4655084Sjohnlev #endif
4665084Sjohnlev 
4675084Sjohnlev void
4685084Sjohnlev xpv_die(struct regs *rp)
4695084Sjohnlev {
4705084Sjohnlev 	struct panic_trap_info ti;
4715084Sjohnlev 	struct cregs creg;
4725084Sjohnlev 
4735084Sjohnlev 	ti.trap_regs = rp;
4745084Sjohnlev 	ti.trap_type = rp->r_trapno;
4755084Sjohnlev 
4765084Sjohnlev 	curthread->t_panic_trap = &ti;
4775084Sjohnlev 	if (ti.trap_type == T_PGFLT) {
4785084Sjohnlev 		getcregs(&creg);
4795084Sjohnlev 		ti.trap_addr = (caddr_t)creg.cr_cr2;
4805084Sjohnlev 		panic("Fatal pagefault at 0x%lx.  fault addr=0x%p  rp=0x%p",
481*7240Srh87107 		    rp->r_pc, (void *)ti.trap_addr, (void *)rp);
4825084Sjohnlev 	} else {
4835084Sjohnlev 		ti.trap_addr = (caddr_t)rp->r_pc;
4845084Sjohnlev 		panic("Fatal trap %ld at 0x%lx.  rp=0x%p", rp->r_trapno,
485*7240Srh87107 		    rp->r_pc, (void *)rp);
4865084Sjohnlev 	}
4875084Sjohnlev }
4885084Sjohnlev 
4895084Sjohnlev /*
4905084Sjohnlev  * Build IDT to handle a Xen panic
4915084Sjohnlev  */
4925084Sjohnlev static void
4935084Sjohnlev switch_to_xpv_panic_idt()
4945084Sjohnlev {
4955084Sjohnlev 	int i;
4965084Sjohnlev 	desctbr_t idtr;
4975084Sjohnlev 	gate_desc_t *idt = xpv_panic_idt;
4985084Sjohnlev 	selector_t cs = get_cs_register();
4995084Sjohnlev 
5005084Sjohnlev 	for (i = 0; i < 32; i++)
5015084Sjohnlev 		set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL);
5025084Sjohnlev 
5035084Sjohnlev 	set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL);
5045084Sjohnlev 	set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL);
5055084Sjohnlev 	set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL);
5065084Sjohnlev 	set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT,
5075084Sjohnlev 	    TRP_XPL);
5085084Sjohnlev 	set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL);
5095084Sjohnlev 	set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL);
5105084Sjohnlev 	set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL);
5115084Sjohnlev 	set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL);
5125084Sjohnlev 	set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL);
5135084Sjohnlev 	set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL);
5145084Sjohnlev 	set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL);
5155084Sjohnlev 	set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL);
5165084Sjohnlev 	set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL);
5175084Sjohnlev 	set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL);
5185084Sjohnlev 	set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL);
5195084Sjohnlev 
5205084Sjohnlev 	/*
5215084Sjohnlev 	 * We have no double fault handler.  Any single fault represents a
5225084Sjohnlev 	 * catastrophic failure for us, so there is no attempt to handle
5235084Sjohnlev 	 * them cleanly: we just print a message and reboot.  If we
5245084Sjohnlev 	 * encounter a second fault while doing that, there is nothing
5255084Sjohnlev 	 * else we can do.
5265084Sjohnlev 	 */
5275084Sjohnlev 
5285084Sjohnlev 	/*
5295084Sjohnlev 	 * Be prepared to absorb any stray device interrupts received
5305084Sjohnlev 	 * while writing the core to disk.
5315084Sjohnlev 	 */
5325084Sjohnlev 	for (i = 33; i < NIDT; i++)
5335084Sjohnlev 		set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT,
5345084Sjohnlev 		    TRP_XPL);
5355084Sjohnlev 
5365084Sjohnlev 	/* The one interrupt we expect to get is from the APIC timer.  */
5375084Sjohnlev 	set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT,
5385084Sjohnlev 	    TRP_XPL);
5395084Sjohnlev 
5405084Sjohnlev 	idtr.dtr_base = (uintptr_t)xpv_panic_idt;
5415084Sjohnlev 	idtr.dtr_limit = sizeof (xpv_panic_idt) - 1;
5425084Sjohnlev 	wr_idtr(&idtr);
5435084Sjohnlev 
5445084Sjohnlev #if defined(__amd64)
5455084Sjohnlev 	/* Catch any hypercalls. */
5465084Sjohnlev 	wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall);
5475084Sjohnlev 	wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall);
5485084Sjohnlev #endif
5495084Sjohnlev }
5505084Sjohnlev 
5515084Sjohnlev static void
5525084Sjohnlev xpv_apic_clkinit()
5535084Sjohnlev {
5545084Sjohnlev 	uint_t		apic_ticks = 0;
5555084Sjohnlev 
5565084Sjohnlev 	/*
5575084Sjohnlev 	 * Measure how many APIC ticks there are within a fixed time
5585084Sjohnlev 	 * period.  We're going to be fairly coarse here.  This timer is
5595084Sjohnlev 	 * just being used to detect a stalled panic, so as long as we have
5605084Sjohnlev 	 * the right order of magnitude, everything should be fine.
5615084Sjohnlev 	 */
5625084Sjohnlev 	xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR;
5635084Sjohnlev 	xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK;
5645084Sjohnlev 	xpv_apicadr[APIC_INT_VECT0] = AV_MASK;	/* local intr reg 0 */
5655084Sjohnlev 
5665084Sjohnlev 	xpv_apicadr[APIC_DIVIDE_REG] = 0;
5675084Sjohnlev 	xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL;
5685084Sjohnlev 	drv_usecwait(XPV_TIMER_INTERVAL);
5695084Sjohnlev 	apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT];
5705084Sjohnlev 
5715084Sjohnlev 	/*
5725084Sjohnlev 	 * apic_ticks now represents roughly how many apic ticks comprise
5735084Sjohnlev 	 * one timeout interval.  Program the timer to send us an interrupt
5745084Sjohnlev 	 * every time that interval expires.
5755084Sjohnlev 	 */
5765084Sjohnlev 	xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_TIME;
5775084Sjohnlev 	xpv_apicadr[APIC_INIT_COUNT] = apic_ticks;
5785084Sjohnlev 	xpv_apicadr[APIC_EOI_REG] = 0;
5795084Sjohnlev }
5805084Sjohnlev 
5815084Sjohnlev void
5825084Sjohnlev xpv_timer_tick(void)
5835084Sjohnlev {
5845084Sjohnlev 	static int ticks = 0;
5855084Sjohnlev 
5865084Sjohnlev 	if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) {
5875084Sjohnlev 		ticks = 0;
5885084Sjohnlev 		if (dump_timeleft && (--dump_timeleft == 0))
5895084Sjohnlev 			panic("Xen panic timeout\n");
5905084Sjohnlev 	}
5915084Sjohnlev 	xpv_apicadr[APIC_EOI_REG] = 0;
5925084Sjohnlev }
5935084Sjohnlev 
5945084Sjohnlev void
5955084Sjohnlev xpv_interrupt(void)
5965084Sjohnlev {
5975084Sjohnlev #ifdef	DEBUG
5985084Sjohnlev 	static int cnt = 0;
5995084Sjohnlev 
6005084Sjohnlev 	if (cnt++ < 10)
6015084Sjohnlev 		xpv_panic_printf("Unexpected interrupt received.\n");
6025084Sjohnlev 	if ((cnt < 1000) && ((cnt % 100) == 0))
6035084Sjohnlev 		xpv_panic_printf("%d unexpected interrupts received.\n", cnt);
6045084Sjohnlev #endif
6055084Sjohnlev 
6065084Sjohnlev 	xpv_apicadr[APIC_EOI_REG] = 0;
6075084Sjohnlev }
6085084Sjohnlev 
6095084Sjohnlev /*
6105084Sjohnlev  * Managing time in panic context is trivial.  We only have a single CPU,
6115084Sjohnlev  * we never get rescheduled, we never get suspended.  We just need to
6125084Sjohnlev  * convert clock ticks into nanoseconds.
6135084Sjohnlev  */
6145084Sjohnlev static hrtime_t
6155084Sjohnlev xpv_panic_gethrtime(void)
6165084Sjohnlev {
6175084Sjohnlev 	hrtime_t tsc, hrt;
6185084Sjohnlev 	unsigned int *l = (unsigned int *)&(tsc);
6195084Sjohnlev 
6205084Sjohnlev 	tsc = __rdtsc_insn();
6215084Sjohnlev 	hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) +
6225084Sjohnlev 	    (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT));
6235084Sjohnlev 
6245084Sjohnlev 	return (hrt);
6255084Sjohnlev }
6265084Sjohnlev 
6275084Sjohnlev static void
6285084Sjohnlev xpv_panic_time_init()
6295084Sjohnlev {
6305084Sjohnlev 	nsec_scale =
6315084Sjohnlev 	    CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT;
6325084Sjohnlev 
6335084Sjohnlev 	gethrtimef = xpv_panic_gethrtime;
6345084Sjohnlev }
6355084Sjohnlev 
/*
 * Varargs front end to panicsys(): bundles the arguments following fmt
 * into a va_list and hands them over together with the saved registers.
 * NOTE(review): the final argument of 1 is presumably panicsys()'s
 * "already on the panic stack" flag -- confirm against its definition.
 */
static void
xpv_panicsys(struct regs *rp, char *fmt, ...)
{
	extern void panicsys(const char *, va_list, struct regs *, int);
	va_list args;

	va_start(args, fmt);
	panicsys(fmt, args, rp, 1);
	va_end(args);
}
6465084Sjohnlev 
6475084Sjohnlev void
6485084Sjohnlev xpv_do_panic(void *arg)
6495084Sjohnlev {
6505084Sjohnlev 	struct panic_info *pip = (struct panic_info *)arg;
6515084Sjohnlev 	int l;
6525084Sjohnlev 	struct cregs creg;
6535084Sjohnlev #if defined(__amd64)
6545084Sjohnlev 	extern uintptr_t postbootkernelbase;
6555084Sjohnlev #endif
6565084Sjohnlev 
6575084Sjohnlev 	if (xpv_panicking++ > 0)
6585084Sjohnlev 		panic("multiple calls to xpv_do_panic()");
6595084Sjohnlev 
6605084Sjohnlev 	/*
6615084Sjohnlev 	 * Indicate to the underlying panic framework that a panic has been
6625084Sjohnlev 	 * initiated.  This is ordinarily done as part of vpanic().  Since
6635084Sjohnlev 	 * we already have all the register state saved by the hypervisor,
6645084Sjohnlev 	 * we skip that and jump straight into the panic processing code.
6655084Sjohnlev 	 */
6665084Sjohnlev 	(void) panic_trigger(&panic_quiesce);
6675084Sjohnlev 
6685084Sjohnlev #if defined(__amd64)
6695084Sjohnlev 	/*
6705084Sjohnlev 	 * bzero() and bcopy() get unhappy when asked to operate on
6715084Sjohnlev 	 * addresses outside of the kernel.  At this point Xen is really a
6725084Sjohnlev 	 * part of the kernel, so we update the routines' notion of where
6735084Sjohnlev 	 * the kernel starts.
6745084Sjohnlev 	 */
6755084Sjohnlev 	postbootkernelbase = xen_virt_start;
6765084Sjohnlev #endif
6775084Sjohnlev 
6785084Sjohnlev #if defined(HYPERVISOR_VIRT_END)
6795084Sjohnlev 	xpv_end = HYPERVISOR_VIRT_END;
6805084Sjohnlev #else
6815084Sjohnlev 	xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t);
6825084Sjohnlev #endif
6835084Sjohnlev 
6845084Sjohnlev 	/*
6855084Sjohnlev 	 * If we were redirecting console output to the hypervisor, we have
6865084Sjohnlev 	 * to stop.
6875084Sjohnlev 	 */
6885084Sjohnlev 	use_polledio = B_FALSE;
6895084Sjohnlev 	if (console == CONS_HYPERVISOR) {
6905084Sjohnlev 		bcons_device_change(CONS_HYPERVISOR);
6915084Sjohnlev 	} else if (cons_polledio != NULL &&
6925084Sjohnlev 	    cons_polledio->cons_polledio_putchar != NULL)  {
6935084Sjohnlev 		if (cons_polledio->cons_polledio_enter != NULL)
6945084Sjohnlev 			cons_polledio->cons_polledio_enter(
6955084Sjohnlev 			    cons_polledio->cons_polledio_argument);
6965084Sjohnlev 		use_polledio = 1;
6975084Sjohnlev 	}
6985084Sjohnlev 
6995084Sjohnlev 	/* Make sure we handle all console output from here on. */
7005084Sjohnlev 	sysp->bsvc_putchar = xpv_panic_putc;
7015084Sjohnlev 
7025084Sjohnlev 	/*
7035084Sjohnlev 	 * If we find an unsupported panic_info structure, there's not much
7045084Sjohnlev 	 * we can do other than complain, plow on, and hope for the best.
7055084Sjohnlev 	 */
7065084Sjohnlev 	if (pip->pi_version != PANIC_INFO_VERSION)
7075084Sjohnlev 		xpv_panic_printf("Warning: Xen is using an unsupported "
7085084Sjohnlev 		    "version of the panic_info structure.\n");
7095084Sjohnlev 
7105084Sjohnlev 	xpv_panic_info = pip;
7115084Sjohnlev 
7126144Srab #if defined(__amd64)
7136144Srab 	kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start;
7146144Srab 	if (xpv_panic_info->pi_xen_start == NULL) {
7156144Srab 		kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end;
7166144Srab 	} else {
7176144Srab 		kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start;
7186144Srab 		kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end;
7196144Srab 		kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end;
7206144Srab 	}
7216144Srab #endif
7226144Srab 
7235084Sjohnlev 	/*
7245084Sjohnlev 	 * Make sure we are running on the Solaris %gs.  The Xen panic code
7255084Sjohnlev 	 * should already have set up the GDT properly.
7265084Sjohnlev 	 */
7275084Sjohnlev 	xpv_panic_resetgs();
7285084Sjohnlev #if defined(__amd64)
7295084Sjohnlev 	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
7305084Sjohnlev #endif
7315084Sjohnlev 
7325084Sjohnlev 	xpv_panic_time_init();
7335084Sjohnlev 
7345084Sjohnlev 	/*
7355084Sjohnlev 	 * Switch to our own IDT, avoiding any accidental returns to Xen
7365084Sjohnlev 	 * world.
7375084Sjohnlev 	 */
7385084Sjohnlev 	switch_to_xpv_panic_idt();
7395084Sjohnlev 
7405084Sjohnlev 	/*
7415084Sjohnlev 	 * Initialize the APIC timer, which is used to detect a hung dump
7425084Sjohnlev 	 * attempt.
7435084Sjohnlev 	 */
7445084Sjohnlev 	xpv_apicadr = pip->pi_apic;
7455084Sjohnlev 	xpv_apic_clkinit();
7465084Sjohnlev 
7475084Sjohnlev 	/*
7485084Sjohnlev 	 * Set up a few values that we'll need repeatedly.
7495084Sjohnlev 	 */
7505084Sjohnlev 	getcregs(&creg);
7515084Sjohnlev 	xpv_panic_cr3 = creg.cr_cr3;
7525084Sjohnlev 	for (l = mmu.max_level; l >= 0; l--)
7535084Sjohnlev 		xpv_panic_nptes[l] = mmu.ptes_per_table;
7545084Sjohnlev #ifdef __i386
7555084Sjohnlev 	if (mmu.pae_hat)
7565084Sjohnlev 		xpv_panic_nptes[mmu.max_level] = 4;
7575084Sjohnlev #endif
7585084Sjohnlev 
7595084Sjohnlev 	/* Add the fake Xen module to the module list */
7605084Sjohnlev 	if (xpv_module != NULL) {
7615084Sjohnlev 		extern int last_module_id;
7625084Sjohnlev 
7635084Sjohnlev 		xpv_modctl->mod_id = last_module_id++;
7645084Sjohnlev 		xpv_modctl->mod_next = &modules;
7655084Sjohnlev 		xpv_modctl->mod_prev = modules.mod_prev;
7665084Sjohnlev 		modules.mod_prev->mod_next = xpv_modctl;
7675084Sjohnlev 		modules.mod_prev = xpv_modctl;
7685084Sjohnlev 	}
7695084Sjohnlev 	xpv_panic_printf = printf;
7705084Sjohnlev 	xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr);
7715084Sjohnlev 	xpv_panic_printf("Failed to reboot following panic.\n");
7725084Sjohnlev 	for (;;)
7735084Sjohnlev 		;
7745084Sjohnlev }
7755084Sjohnlev 
7765084Sjohnlev /*
7775084Sjohnlev  * Set up the necessary data structures to pretend that the Xen hypervisor
7785084Sjohnlev  * is a loadable module, allowing mdb to find the Xen symbols in a crash
7795084Sjohnlev  * dump.  Since these symbols all map to VA space Solaris doesn't normally
7805084Sjohnlev  * have access to, we don't link these structures into the kernel's lists
7815084Sjohnlev  * until/unless we hit a Xen panic.
7825084Sjohnlev  *
7835084Sjohnlev  * The observant reader will note a striking amount of overlap between this
7845084Sjohnlev  * code and that found in krtld.  While it would be handy if we could just
7855084Sjohnlev  * ask krtld to do this work for us, it's not that simple.  Among the
7865084Sjohnlev  * complications: we're not actually loading the text here (grub did it at
7875084Sjohnlev  * boot), the .text section is writable, there are no relocations to do,
7885084Sjohnlev  * none of the module text/data is in readable memory, etc.  Training krtld
7895084Sjohnlev  * to deal with this weird module is as complicated, and more risky, than
7905084Sjohnlev  * reimplementing the necessary subset of it here.
7915084Sjohnlev  */
7925084Sjohnlev static void
7935084Sjohnlev init_xen_module()
7945084Sjohnlev {
7955084Sjohnlev 	struct _buf *file = NULL;
7965084Sjohnlev 	struct module *mp;
7975084Sjohnlev 	struct modctl *mcp;
7985084Sjohnlev 	int i, shn;
7995084Sjohnlev 	Shdr *shp, *ctf_shp;
8005084Sjohnlev 	char *names = NULL;
8015084Sjohnlev 	size_t n, namesize, text_align, data_align;
8025084Sjohnlev #if defined(__amd64)
8035084Sjohnlev 	const char machine = EM_AMD64;
8045084Sjohnlev #else
8055084Sjohnlev 	const char machine = EM_386;
8065084Sjohnlev #endif
8075084Sjohnlev 
8085084Sjohnlev 	/* Allocate and init the module structure */
8095084Sjohnlev 	mp = kmem_zalloc(sizeof (*mp), KM_SLEEP);
8105084Sjohnlev 	mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
8115084Sjohnlev 	(void) strcpy(mp->filename, XPV_FILENAME);
8125084Sjohnlev 
8135084Sjohnlev 	/* Allocate and init the modctl structure */
8145084Sjohnlev 	mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP);
8155084Sjohnlev 	mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP);
8165084Sjohnlev 	(void) strcpy(mcp->mod_modname, XPV_MODNAME);
8175084Sjohnlev 	mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
8185084Sjohnlev 	(void) strcpy(mcp->mod_filename, XPV_FILENAME);
8195084Sjohnlev 	mcp->mod_inprogress_thread = (kthread_id_t)-1;
8205084Sjohnlev 	mcp->mod_ref = 1;
8215084Sjohnlev 	mcp->mod_loaded = 1;
8225084Sjohnlev 	mcp->mod_loadcnt = 1;
8235084Sjohnlev 	mcp->mod_mp = mp;
8245084Sjohnlev 
8255084Sjohnlev 	/*
8265084Sjohnlev 	 * Try to open a Xen image that hasn't had its symbol and CTF
8275084Sjohnlev 	 * information stripped off.
8285084Sjohnlev 	 */
8295084Sjohnlev 	file = kobj_open_file(XPV_FILENAME);
8305084Sjohnlev 	if (file == (struct _buf *)-1) {
8315084Sjohnlev 		file = NULL;
8325084Sjohnlev 		goto err;
8335084Sjohnlev 	}
8345084Sjohnlev 
8355084Sjohnlev 	/*
8365084Sjohnlev 	 * Read the header and ensure that this is an ELF file for the
8375084Sjohnlev 	 * proper ISA.  If it's not, somebody has done something very
8385084Sjohnlev 	 * stupid.  Why bother?  See Mencken.
8395084Sjohnlev 	 */
8405084Sjohnlev 	if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0)
8415084Sjohnlev 		goto err;
8425084Sjohnlev 	for (i = 0; i < SELFMAG; i++)
8435084Sjohnlev 		if (mp->hdr.e_ident[i] != ELFMAG[i])
8445084Sjohnlev 			goto err;
8455084Sjohnlev 	if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) ||
8465084Sjohnlev 	    (mp->hdr.e_machine != machine))
8475084Sjohnlev 		goto err;
8485084Sjohnlev 
8495084Sjohnlev 	/* Read in the section headers */
8505084Sjohnlev 	n = mp->hdr.e_shentsize * mp->hdr.e_shnum;
8515084Sjohnlev 	mp->shdrs = kmem_zalloc(n, KM_SLEEP);
8525084Sjohnlev 	if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0)
8535084Sjohnlev 		goto err;
8545084Sjohnlev 
8555084Sjohnlev 	/* Read the section names */
8565084Sjohnlev 	shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize);
8575084Sjohnlev 	namesize = shp->sh_size;
8585084Sjohnlev 	names = kmem_zalloc(shp->sh_size, KM_SLEEP);
8595084Sjohnlev 	if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0)
8605084Sjohnlev 		goto err;
8615084Sjohnlev 
8625084Sjohnlev 	/*
8635084Sjohnlev 	 * Fill in the text and data size fields.
8645084Sjohnlev 	 */
8655084Sjohnlev 	ctf_shp = NULL;
8665084Sjohnlev 	text_align = data_align = 0;
8675084Sjohnlev 	for (shn = 1; shn < mp->hdr.e_shnum; shn++) {
8685084Sjohnlev 		shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize);
8695084Sjohnlev 
8705084Sjohnlev 		/* Sanity check the offset of the section name */
8715084Sjohnlev 		if (shp->sh_name >= namesize)
8725084Sjohnlev 			continue;
8735084Sjohnlev 
8745084Sjohnlev 		/* If we find the symtab section, remember it for later. */
8755084Sjohnlev 		if (shp->sh_type == SHT_SYMTAB) {
8765084Sjohnlev 			mp->symtbl_section = shn;
8775084Sjohnlev 			mp->symhdr = shp;
8785084Sjohnlev 			continue;
8795084Sjohnlev 		}
8805084Sjohnlev 
8815084Sjohnlev 		/* If we find the CTF section, remember it for later. */
8825084Sjohnlev 		if ((shp->sh_size != 0) &&
8835084Sjohnlev 		    (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) {
8845084Sjohnlev 			ctf_shp = shp;
8855084Sjohnlev 			continue;
8865084Sjohnlev 		}
8875084Sjohnlev 
8885084Sjohnlev 		if (!(shp->sh_flags & SHF_ALLOC))
8895084Sjohnlev 			continue;
8905084Sjohnlev 
8915084Sjohnlev 		/*
8925084Sjohnlev 		 * Xen marks its text section as writable, so we need to
8935084Sjohnlev 		 * look for the name - not just the flag.
8945084Sjohnlev 		 */
8955084Sjohnlev 		if ((strcmp(&names[shp->sh_name], ".text") != NULL) &&
8965084Sjohnlev 		    (shp->sh_flags & SHF_WRITE) != 0) {
8975084Sjohnlev 			if (shp->sh_addralign > data_align)
8985084Sjohnlev 				data_align = shp->sh_addralign;
8995084Sjohnlev 			mp->data_size = ALIGN(mp->data_size, data_align);
9005084Sjohnlev 			mp->data_size += ALIGN(shp->sh_size, 8);
9015084Sjohnlev 			if (mp->data == NULL || mp->data > (char *)shp->sh_addr)
9025084Sjohnlev 				mp->data = (char *)shp->sh_addr;
9035084Sjohnlev 		} else {
9045084Sjohnlev 			if (shp->sh_addralign > text_align)
9055084Sjohnlev 				text_align = shp->sh_addralign;
9065084Sjohnlev 			mp->text_size = ALIGN(mp->text_size, text_align);
9075084Sjohnlev 			mp->text_size += ALIGN(shp->sh_size, 8);
9085084Sjohnlev 			if (mp->text == NULL || mp->text > (char *)shp->sh_addr)
9095084Sjohnlev 				mp->text = (char *)shp->sh_addr;
9105084Sjohnlev 		}
9115084Sjohnlev 	}
9125084Sjohnlev 	kmem_free(names, namesize);
9135084Sjohnlev 	names = NULL;
9145249Snn35248 	shp = NULL;
9155084Sjohnlev 	mcp->mod_text = mp->text;
9165084Sjohnlev 	mcp->mod_text_size = mp->text_size;
9175084Sjohnlev 
9185084Sjohnlev 	/*
9195084Sjohnlev 	 * If we have symbol table and string table sections, read them in
9205084Sjohnlev 	 * now.  If we don't, we just plow on.  We'll still get a valid
9215084Sjohnlev 	 * core dump, but finding anything useful will be just a bit
9225084Sjohnlev 	 * harder.
9235084Sjohnlev 	 *
9245084Sjohnlev 	 * Note: we don't bother with a hash table.  We'll never do a
9255084Sjohnlev 	 * symbol lookup unless we crash, and then mdb creates its own.  We
9265084Sjohnlev 	 * also don't try to perform any relocations.  Xen should be loaded
9275084Sjohnlev 	 * exactly where the ELF file indicates, and the symbol information
9285084Sjohnlev 	 * in the file should be complete and correct already.  Static
9295084Sjohnlev 	 * linking ain't all bad.
9305084Sjohnlev 	 */
9315084Sjohnlev 	if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) {
9325084Sjohnlev 		mp->strhdr = (Shdr *)
9335084Sjohnlev 		    (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize);
9345084Sjohnlev 		mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize;
9355084Sjohnlev 
9365084Sjohnlev 		/* Allocate space for the symbol table and strings.  */
9375084Sjohnlev 		mp->symsize = mp->symhdr->sh_size +
9385084Sjohnlev 		    mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size;
9395084Sjohnlev 		mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP);
9405084Sjohnlev 		mp->symtbl = mp->symspace;
9415084Sjohnlev 		mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size);
9425084Sjohnlev 
9435084Sjohnlev 		if ((kobj_read_file(file, mp->symtbl,
9445084Sjohnlev 		    mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) ||
9455084Sjohnlev 		    (kobj_read_file(file, mp->strings,
9465084Sjohnlev 		    mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0))
9475084Sjohnlev 			goto err;
9485084Sjohnlev 	}
9495084Sjohnlev 
9505084Sjohnlev 	/*
9515084Sjohnlev 	 * Read in the CTF section
9525084Sjohnlev 	 */
9535084Sjohnlev 	if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) {
9545249Snn35248 		mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP);
9555084Sjohnlev 		mp->ctfsize = ctf_shp->sh_size;
9565084Sjohnlev 		if (kobj_read_file(file, mp->ctfdata, mp->ctfsize,
9575084Sjohnlev 		    ctf_shp->sh_offset) < 0)
9585084Sjohnlev 			goto err;
9595084Sjohnlev 	}
9605084Sjohnlev 
9615084Sjohnlev 	kobj_close_file(file);
9625084Sjohnlev 
9635084Sjohnlev 	xpv_module = mp;
9645084Sjohnlev 	xpv_modctl = mcp;
9655084Sjohnlev 	return;
9665084Sjohnlev 
9675084Sjohnlev err:
9685084Sjohnlev 	cmn_err(CE_WARN, "Failed to initialize xpv module.");
9695084Sjohnlev 	if (file != NULL)
9705084Sjohnlev 		kobj_close_file(file);
9715084Sjohnlev 
9725084Sjohnlev 	kmem_free(mp->filename, strlen(XPV_FILENAME) + 1);
9735084Sjohnlev 	if (mp->shdrs != NULL)
9745084Sjohnlev 		kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum);
9755084Sjohnlev 	if (mp->symspace != NULL)
9765084Sjohnlev 		kmem_free(mp->symspace, mp->symsize);
9775084Sjohnlev 	if (mp->ctfdata != NULL)
9785084Sjohnlev 		kmem_free(mp->ctfdata, mp->ctfsize);
9795084Sjohnlev 	kmem_free(mp, sizeof (*mp));
9805084Sjohnlev 	kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1);
9815084Sjohnlev 	kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1);
9825084Sjohnlev 	kmem_free(mcp, sizeof (*mcp));
9835084Sjohnlev 	if (names != NULL)
9845084Sjohnlev 		kmem_free(names, namesize);
9855084Sjohnlev }
9865084Sjohnlev 
9875084Sjohnlev void
9885084Sjohnlev xpv_panic_init()
9895084Sjohnlev {
9905084Sjohnlev 	xen_platform_op_t op;
9915084Sjohnlev 	int i;
9925084Sjohnlev 
9935084Sjohnlev 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
9945084Sjohnlev 
9955084Sjohnlev 	for (i = 0; i < mmu.num_level; i++)
9965084Sjohnlev 		ptable_pfn[i] = PFN_INVALID;
9975084Sjohnlev 
9985084Sjohnlev 	/* Let Xen know where to jump if/when it panics. */
9995084Sjohnlev 	op.cmd = XENPF_panic_init;
10005084Sjohnlev 	op.interface_version = XENPF_INTERFACE_VERSION;
10015084Sjohnlev 	op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr;
10025084Sjohnlev 
10035084Sjohnlev 	(void) HYPERVISOR_platform_op(&op);
10045084Sjohnlev 
10055084Sjohnlev 	init_xen_module();
10065084Sjohnlev }
1007