1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 #include <sys/types.h>
26 #include <sys/clock.h>
27 #include <sys/psm.h>
28 #include <sys/archsystm.h>
29 #include <sys/machsystm.h>
30 #include <sys/compress.h>
31 #include <sys/modctl.h>
32 #include <sys/trap.h>
33 #include <sys/panic.h>
34 #include <sys/regset.h>
35 #include <sys/frame.h>
36 #include <sys/kobj.h>
37 #include <sys/apic.h>
38 #include <sys/apic_timer.h>
39 #include <sys/dumphdr.h>
40 #include <sys/mem.h>
41 #include <sys/x86_archext.h>
42 #include <sys/xpv_panic.h>
43 #include <sys/boot_console.h>
44 #include <sys/bootsvcs.h>
45 #include <sys/consdev.h>
46 #include <vm/hat_pte.h>
47 #include <vm/hat_i86.h>
48
/* XXX: need to add a PAE version too, if we ever support both PAE and non */
/* Path of the unstripped Xen image that init_xen_module() reads. */
#if defined(__i386)
#define XPV_FILENAME "/boot/xen-syms"
#else
#define XPV_FILENAME "/boot/amd64/xen-syms"
#endif
/* Module name given to the fake Xen modctl. */
#define XPV_MODNAME "xpv"

/* Incremented on entry to xpv_do_panic(); used there to catch repeat calls. */
int xpv_panicking = 0;

/*
 * Fake module/modctl pair describing Xen.  They are filled in by
 * init_xen_module() and linked into the module list by xpv_do_panic().
 */
struct module *xpv_module;
struct modctl *xpv_modctl;

/*
 * Round x up to a multiple of a.  The mask arithmetic only works when a is
 * a power of two; an alignment of 0 returns x unchanged.
 */
#define ALIGN(x, a) ((a) == 0 ? (uintptr_t)(x) : \
	(((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l)))

/* Pointer to the xpv_panic_info structure handed to us by Xen. */
static struct panic_info *xpv_panic_info = NULL;

/* Timer support */
/* Shift applied when scaling TSC ticks to nanoseconds. */
#define NSEC_SHIFT 5
/* IDT vector the APIC timer is programmed to raise. */
#define T_XPV_TIMER 0xd1
#define XPV_TIMER_INTERVAL 1000 /* 1000 microseconds */
/* Local APIC register window; set from pi_apic in xpv_do_panic(). */
static uint32_t *xpv_apicadr = NULL;
/* TSC-to-nanosecond multiplier; set by xpv_panic_time_init(). */
static uint_t nsec_scale;

/* IDT support */
#pragma align 16(xpv_panic_idt)
static gate_desc_t xpv_panic_idt[NIDT]; /* interrupt descriptor table */

/* Xen pagetables mapped into our HAT's ptable windows */
static pfn_t ptable_pfn[MAX_NUM_LEVEL];

/* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
static int xpv_dump_pages;

/*
 * There are up to two large swathes of RAM that we don't want to include
 * in the dump: those that comprise the Xen version of segkpm. On 32-bit
 * systems there is no such region of memory. On 64-bit systems, there
 * should be just a single contiguous region that corresponds to all of
 * physical memory. The tricky bit is that Xen's heap sometimes lives in
 * the middle of their segkpm, and is mapped using only kpm-like addresses.
 * In that case, we need to skip the swathes before and after Xen's heap.
 */
uintptr_t kpm1_low = 0;
uintptr_t kpm1_high = 0;
uintptr_t kpm2_low = 0;
uintptr_t kpm2_high = 0;

/*
 * Some commonly used values that we don't want to recompute over and over.
 * All three are filled in by xpv_do_panic(): the number of entries in a
 * page table at each level, the value of %cr3, and the upper bound of the
 * Xen virtual address range that gets walked.
 */
static int xpv_panic_nptes[MAX_NUM_LEVEL];
static ulong_t xpv_panic_cr3;
static uintptr_t xpv_end;

/*
 * Output routine used for panic-time messages.  It starts out as the
 * private console printer and is switched to printf() by xpv_do_panic().
 */
static void xpv_panic_console_print(const char *fmt, ...);
static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;

/* Formatting buffer for xpv_panic_console_print(). */
#define CONSOLE_BUF_SIZE 256
static char console_buffer[CONSOLE_BUF_SIZE];
/* B_TRUE when xpv_panic_putc() should use the polled I/O putchar routine. */
static boolean_t use_polledio;

/*
 * Pointers to machine check panic info (if any).
 */
xpv_mca_panic_data_t *xpv_mca_panic_data = NULL;
117
118 static void
xpv_panic_putc(int m)119 xpv_panic_putc(int m)
120 {
121 struct cons_polledio *c = cons_polledio;
122
123 /* This really shouldn't happen */
124 if (console == CONS_HYPERVISOR)
125 return;
126
127 if (use_polledio == B_TRUE)
128 c->cons_polledio_putchar(c->cons_polledio_argument, m);
129 else
130 bcons_putchar(m);
131 }
132
133 static void
xpv_panic_puts(char * msg)134 xpv_panic_puts(char *msg)
135 {
136 char *m;
137
138 dump_timeleft = dump_timeout;
139 for (m = msg; *m; m++)
140 xpv_panic_putc((int)*m);
141 }
142
143 static void
xpv_panic_console_print(const char * fmt,...)144 xpv_panic_console_print(const char *fmt, ...)
145 {
146 va_list ap;
147
148 va_start(ap, fmt);
149 (void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap);
150 va_end(ap);
151
152 xpv_panic_puts(console_buffer);
153 }
154
155 static void
xpv_panic_map(int level,pfn_t pfn)156 xpv_panic_map(int level, pfn_t pfn)
157 {
158 x86pte_t pte, *pteptr;
159
160 /*
161 * The provided pfn represents a level 'level' page table. Map it
162 * into the 'level' slot in the list of page table windows.
163 */
164 pteptr = (x86pte_t *)PWIN_PTE_VA(level);
165 pte = pfn_to_pa(pfn) | PT_VALID;
166
167 XPV_ALLOW_PAGETABLE_UPDATES();
168 if (mmu.pae_hat)
169 *pteptr = pte;
170 else
171 *(x86pte32_t *)pteptr = pte;
172 XPV_DISALLOW_PAGETABLE_UPDATES();
173
174 mmu_tlbflush_entry(PWIN_VA(level));
175 }
176
/*
 * Walk the page tables to find the pfn mapped by the given va.
 *
 * vaddr is an in/out parameter.  On entry *vaddr is the address at which
 * the search starts.  When a mapped page is found, *vaddr is updated to
 * the address of that page and the page's frame number is returned with
 * PFN_IS_FOREIGN_MFN set.  If the search reaches xpv_end (or the address
 * drops below the starting point) without finding a page, PFN_INVALID is
 * returned and *vaddr is left untouched.
 *
 * The APIC page and the two kpm-like regions (kpm1_low..kpm1_high and
 * kpm2_low..kpm2_high) are skipped rather than returned.
 */
static pfn_t
xpv_va_walk(uintptr_t *vaddr)
{
	int l, idx;
	pfn_t pfn;
	x86pte_t pte;
	x86pte_t *ptep;
	uintptr_t va = *vaddr;
	uintptr_t scan_va;
	caddr_t ptable_window;
	static pfn_t toplevel_pfn;
	static uintptr_t lastva;	/* va returned by the previous call */

	/*
	 * If we do anything other than a simple scan through memory, don't
	 * trust the mapped page tables.
	 */
	if (va != lastva + MMU_PAGESIZE)
		for (l = mmu.max_level; l >= 0; l--)
			ptable_pfn[l] = PFN_INVALID;

	/* The top-level table is the one %cr3 points at. */
	toplevel_pfn = mmu_btop(xpv_panic_cr3);

	/*
	 * Each pass of this loop restarts the descent from the top-level
	 * table at the current va.  The va >= *vaddr test presumably guards
	 * against va wrapping around the top of the address space.
	 */
	while (va < xpv_end && va >= *vaddr) {
		/* Find the lowest table with any entry for va */
		pfn = toplevel_pfn;
		for (l = mmu.max_level; l >= 0; l--) {
			/* Only remap the window if it shows another table. */
			if (ptable_pfn[l] != pfn) {
				xpv_panic_map(l, pfn);
				ptable_pfn[l] = pfn;
			}

			/*
			 * Search this pagetable for any mapping to an
			 * address >= va.
			 */
			ptable_window = PWIN_VA(l);
			/*
			 * For a PAE top-level table, apply %cr3's offset
			 * within its page to the window address.
			 */
			if (l == mmu.max_level && mmu.pae_hat)
				ptable_window +=
				    (xpv_panic_cr3 & MMU_PAGEOFFSET);

			idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1);
			scan_va = va;
			while (idx < xpv_panic_nptes[l] && scan_va < xpv_end &&
			    scan_va >= *vaddr) {
				ptep = (x86pte_t *)(ptable_window +
				    (idx << mmu.pte_size_shift));
				pte = GET_PTE(ptep);
				if (pte & PTE_VALID)
					break;
				idx++;
				scan_va += mmu.level_size[l];
			}

			/*
			 * If there are no valid mappings in this table, we
			 * can skip to the end of the VA range it covers.
			 */
			if (idx == xpv_panic_nptes[l]) {
				va = NEXT_ENTRY_VA(va, l + 1);
				break;
			}

			va = scan_va;
			/*
			 * See if we've hit the end of the range.
			 */
			if (va >= xpv_end || va < *vaddr)
				break;

			/*
			 * If this mapping is for a pagetable, we drop down
			 * to the next level in the hierarchy and look for
			 * a mapping in it.
			 */
			pfn = PTE2MFN(pte, l);
			if (!PTE_ISPAGE(pte, l))
				continue;

			/*
			 * The APIC page is magic. Nothing to see here;
			 * move along.
			 */
			if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) ==
			    (va & MMU_PAGEMASK)) {
				va += MMU_PAGESIZE;
				break;
			}

			/*
			 * See if the address is within one of the two
			 * kpm-like regions we want to skip.
			 */
			if (va >= kpm1_low && va < kpm1_high) {
				va = kpm1_high;
				break;
			}
			if (va >= kpm2_low && va < kpm2_high) {
				va = kpm2_high;
				break;
			}

			/*
			 * The Xen panic code only handles small pages. If
			 * this mapping is for a large page, we need to
			 * identify the constituent page that covers the
			 * specific VA we were looking for.
			 */
			if (l > 0) {
				if (l > 1)
					panic("Xen panic can't cope with "
					    "giant pages.");
				idx = (va >> LEVEL_SHIFT(0)) &
				    (xpv_panic_nptes[0] - 1);
				pfn += idx;
			}

			/* Found a page: report it and remember where. */
			*vaddr = va;
			lastva = va;
			return (pfn | PFN_IS_FOREIGN_MFN);
		}
	}
	return (PFN_INVALID);
}
304
305 /*
306 * Walk through the Xen VA space, finding pages that are mapped in.
307 *
308 * These pages all have MFNs rather than PFNs, meaning they may be outside
309 * the physical address space the kernel knows about, or they may collide
310 * with PFNs the kernel is using.
311 *
312 * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs
313 * to avoid collisions doesn't work. The pages need to be written to disk
314 * in PFN-order or savecore gets confused. We can't allocate memory to
315 * contruct a sorted pfn->VA reverse mapping, so we have to write the pages
316 * to disk in VA order.
317 *
318 * To square this circle, we simply make up PFNs for each of Xen's pages.
319 * We assign each mapped page a fake PFN in ascending order. These fake
320 * PFNs each have the FOREIGN bit set, ensuring that they fall outside the
321 * range of Solaris PFNs written by the kernel.
322 */
323 int
dump_xpv_addr()324 dump_xpv_addr()
325 {
326 uintptr_t va;
327 mem_vtop_t mem_vtop;
328
329 xpv_dump_pages = 0;
330 va = xen_virt_start;
331
332 while (xpv_va_walk(&va) != PFN_INVALID) {
333 mem_vtop.m_as = &kas;
334 mem_vtop.m_va = (void *)va;
335 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
336
337 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
338 xpv_dump_pages++;
339
340 va += MMU_PAGESIZE;
341 }
342
343 /*
344 * Add the shared_info page. This page actually ends up in the
345 * dump twice: once for the Xen va and once for the Solaris va.
346 * This isn't ideal, but we don't know the address Xen is using for
347 * the page, so we can't share it.
348 */
349 mem_vtop.m_as = &kas;
350 mem_vtop.m_va = HYPERVISOR_shared_info;
351 mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
352 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
353 xpv_dump_pages++;
354
355 return (xpv_dump_pages);
356 }
357
358 void
dump_xpv_pfn()359 dump_xpv_pfn()
360 {
361 pfn_t pfn;
362 int cnt;
363
364 for (cnt = 0; cnt < xpv_dump_pages; cnt++) {
365 pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN;
366 dumpvp_write(&pfn, sizeof (pfn));
367 }
368 }
369
370 int
dump_xpv_data(void * dump_cbuf)371 dump_xpv_data(void *dump_cbuf)
372 {
373 uintptr_t va;
374 uint32_t csize;
375 int cnt = 0;
376
377 /*
378 * XXX: we should probably run this data through a UE check. The
379 * catch is that the UE code relies on on_trap() and getpfnum()
380 * working.
381 */
382 va = xen_virt_start;
383
384 while (xpv_va_walk(&va) != PFN_INVALID) {
385 csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE);
386 dumpvp_write(&csize, sizeof (uint32_t));
387 dumpvp_write(dump_cbuf, csize);
388 if (dump_ioerr) {
389 dumphdr->dump_flags &= ~DF_COMPLETE;
390 return (cnt);
391 }
392 cnt++;
393 va += MMU_PAGESIZE;
394 }
395
396 /*
397 * Finally, dump the shared_info page
398 */
399 csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf,
400 PAGESIZE);
401 dumpvp_write(&csize, sizeof (uint32_t));
402 dumpvp_write(dump_cbuf, csize);
403 if (dump_ioerr)
404 dumphdr->dump_flags &= ~DF_COMPLETE;
405 cnt++;
406
407 return (cnt);
408 }
409
410 static void *
showstack(void * fpreg,int xpv_only)411 showstack(void *fpreg, int xpv_only)
412 {
413 struct frame *fpp;
414 ulong_t off;
415 char *sym;
416 uintptr_t pc, fp, lastfp;
417 uintptr_t minaddr = min(KERNELBASE, xen_virt_start);
418
419 fp = (uintptr_t)fpreg;
420 if (fp < minaddr) {
421 xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg);
422 return (fpreg);
423 }
424
425 do {
426 fpp = (struct frame *)fp;
427 pc = fpp->fr_savpc;
428
429 if ((xpv_only != 0) &&
430 (fp > xpv_end || fp < xen_virt_start))
431 break;
432 if ((sym = kobj_getsymname(pc, &off)) != NULL)
433 xpv_panic_printf("%08lx %s:%s+%lx\n", fp,
434 mod_containing_pc((caddr_t)pc), sym, off);
435 else if ((pc >= xen_virt_start) && (pc <= xpv_end))
436 xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc);
437 else
438 xpv_panic_printf("%08lx %lx\n", fp, pc);
439
440 lastfp = fp;
441 fp = fpp->fr_savfp;
442
443 /*
444 * Xen marks an exception frame by inverting the frame
445 * pointer.
446 */
447 if (fp < lastfp) {
448 if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff)
449 fp = ~fp;
450 }
451 } while (fp > lastfp);
452 return ((void *)fp);
453 }
454
/*
 * Print the Xen-only portion of the stack starting at fpreg and return the
 * frame pointer at which the trace stopped.
 */
void *
xpv_traceback(void *fpreg)
{
	void *stopped_at;

	stopped_at = showstack(fpreg, 1);
	return (stopped_at);
}
460
461 #if defined(__amd64)
462 static void
xpv_panic_hypercall(ulong_t call)463 xpv_panic_hypercall(ulong_t call)
464 {
465 panic("Illegally issued hypercall %d during panic!\n", (int)call);
466 }
467 #endif
468
469 void
xpv_die(struct regs * rp)470 xpv_die(struct regs *rp)
471 {
472 struct panic_trap_info ti;
473 struct cregs creg;
474
475 ti.trap_regs = rp;
476 ti.trap_type = rp->r_trapno;
477
478 curthread->t_panic_trap = &ti;
479 if (ti.trap_type == T_PGFLT) {
480 getcregs(&creg);
481 ti.trap_addr = (caddr_t)creg.cr_cr2;
482 panic("Fatal pagefault at 0x%lx. fault addr=0x%p rp=0x%p",
483 rp->r_pc, (void *)ti.trap_addr, (void *)rp);
484 } else {
485 ti.trap_addr = (caddr_t)rp->r_pc;
486 panic("Fatal trap %ld at 0x%lx. rp=0x%p", rp->r_trapno,
487 rp->r_pc, (void *)rp);
488 }
489 }
490
491 /*
492 * Build IDT to handle a Xen panic
493 */
494 static void
switch_to_xpv_panic_idt()495 switch_to_xpv_panic_idt()
496 {
497 int i;
498 desctbr_t idtr;
499 gate_desc_t *idt = xpv_panic_idt;
500 selector_t cs = get_cs_register();
501
502 for (i = 0; i < 32; i++)
503 set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL,
504 0);
505
506 set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL,
507 0);
508 set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
509 set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0);
510 set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT,
511 TRP_XPL, 0);
512 set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL,
513 0);
514 set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL,
515 0);
516 set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL,
517 0);
518 set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL,
519 0);
520 set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0);
521 set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0);
522 set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0);
523 set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL,
524 0);
525 set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL,
526 0);
527 set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0);
528 set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
529
530 /*
531 * We have no double fault handler. Any single fault represents a
532 * catastrophic failure for us, so there is no attempt to handle
533 * them cleanly: we just print a message and reboot. If we
534 * encounter a second fault while doing that, there is nothing
535 * else we can do.
536 */
537
538 /*
539 * Be prepared to absorb any stray device interrupts received
540 * while writing the core to disk.
541 */
542 for (i = 33; i < NIDT; i++)
543 set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT,
544 TRP_XPL, 0);
545
546 /* The one interrupt we expect to get is from the APIC timer. */
547 set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT,
548 TRP_XPL, 0);
549
550 idtr.dtr_base = (uintptr_t)xpv_panic_idt;
551 idtr.dtr_limit = sizeof (xpv_panic_idt) - 1;
552 wr_idtr(&idtr);
553
554 #if defined(__amd64)
555 /* Catch any hypercalls. */
556 wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall);
557 wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall);
558 #endif
559 }
560
/*
 * Calibrate the local APIC timer against drv_usecwait() and then program
 * it to raise the T_XPV_TIMER vector periodically, roughly once per
 * XPV_TIMER_INTERVAL microseconds.  xpv_apicadr must already point at the
 * APIC registers.
 */
static void
xpv_apic_clkinit()
{
	uint_t apic_ticks = 0;

	/*
	 * Measure how many APIC ticks there are within a fixed time
	 * period. We're going to be fairly coarse here. This timer is
	 * just being used to detect a stalled panic, so as long as we have
	 * the right order of magnitude, everything should be fine.
	 */
	xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR;
	/* Keep the timer and local interrupt 0 masked while we measure. */
	xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK;
	xpv_apicadr[APIC_INT_VECT0] = AV_MASK; /* local intr reg 0 */

	/* Start the count at its maximum and see how far it gets. */
	xpv_apicadr[APIC_DIVIDE_REG] = 0;
	xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL;
	drv_usecwait(XPV_TIMER_INTERVAL);
	apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT];

	/*
	 * apic_ticks now represents roughly how many apic ticks comprise
	 * one timeout interval. Program the timer to send us an interrupt
	 * every time that interval expires.
	 */
	xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_PERIODIC;
	xpv_apicadr[APIC_INIT_COUNT] = apic_ticks;
	xpv_apicadr[APIC_EOI_REG] = 0;
}
590
591 void
xpv_timer_tick(void)592 xpv_timer_tick(void)
593 {
594 static int ticks = 0;
595
596 if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) {
597 ticks = 0;
598 if (dump_timeleft && (--dump_timeleft == 0))
599 panic("Xen panic timeout\n");
600 }
601 xpv_apicadr[APIC_EOI_REG] = 0;
602 }
603
604 void
xpv_interrupt(void)605 xpv_interrupt(void)
606 {
607 #ifdef DEBUG
608 static int cnt = 0;
609
610 if (cnt++ < 10)
611 xpv_panic_printf("Unexpected interrupt received.\n");
612 if ((cnt < 1000) && ((cnt % 100) == 0))
613 xpv_panic_printf("%d unexpected interrupts received.\n", cnt);
614 #endif
615
616 xpv_apicadr[APIC_EOI_REG] = 0;
617 }
618
619 /*
620 * Managing time in panic context is trivial. We only have a single CPU,
621 * we never get rescheduled, we never get suspended. We just need to
622 * convert clock ticks into nanoseconds.
623 */
624 static hrtime_t
xpv_panic_gethrtime(void)625 xpv_panic_gethrtime(void)
626 {
627 hrtime_t tsc, hrt;
628 unsigned int *l = (unsigned int *)&(tsc);
629
630 tsc = __rdtsc_insn();
631 hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) +
632 (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT));
633
634 return (hrt);
635 }
636
637 static void
xpv_panic_time_init()638 xpv_panic_time_init()
639 {
640 nsec_scale =
641 CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT;
642
643 gethrtimef = xpv_panic_gethrtime;
644 }
645
/*
 * Varargs front end to panicsys(): packages the format arguments into a
 * va_list and passes them on together with the saved register state.
 */
static void
xpv_panicsys(struct regs *rp, char *fmt, ...)
{
	extern void panicsys(const char *, va_list, struct regs *, int);
	va_list args;

	va_start(args, fmt);
	panicsys(fmt, args, rp, 1);
	va_end(args);
}
656
657 void
xpv_do_panic(void * arg)658 xpv_do_panic(void *arg)
659 {
660 struct panic_info *pip = (struct panic_info *)arg;
661 int l;
662 struct cregs creg;
663 #if defined(__amd64)
664 extern uintptr_t postbootkernelbase;
665 #endif
666
667 if (xpv_panicking++ > 0)
668 panic("multiple calls to xpv_do_panic()");
669
670 /*
671 * Indicate to the underlying panic framework that a panic has been
672 * initiated. This is ordinarily done as part of vpanic(). Since
673 * we already have all the register state saved by the hypervisor,
674 * we skip that and jump straight into the panic processing code.
675 *
676 * XXX If another thread grabs and wins the panic_quiesce trigger
677 * then we'll have two threads in panicsys believing they are in
678 * charge of the panic attempt!
679 */
680 (void) panic_trigger(&panic_quiesce);
681
682 #if defined(__amd64)
683 /*
684 * bzero() and bcopy() get unhappy when asked to operate on
685 * addresses outside of the kernel. At this point Xen is really a
686 * part of the kernel, so we update the routines' notion of where
687 * the kernel starts.
688 */
689 postbootkernelbase = xen_virt_start;
690 #endif
691
692 #if defined(HYPERVISOR_VIRT_END)
693 xpv_end = HYPERVISOR_VIRT_END;
694 #else
695 xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t);
696 #endif
697
698 /*
699 * If we were redirecting console output to the hypervisor, we have
700 * to stop.
701 */
702 use_polledio = B_FALSE;
703 if (console == CONS_HYPERVISOR) {
704 bcons_device_change(CONS_HYPERVISOR);
705 } else if (cons_polledio != NULL &&
706 cons_polledio->cons_polledio_putchar != NULL) {
707 if (cons_polledio->cons_polledio_enter != NULL)
708 cons_polledio->cons_polledio_enter(
709 cons_polledio->cons_polledio_argument);
710 use_polledio = 1;
711 }
712
713 /* Make sure we handle all console output from here on. */
714 sysp->bsvc_putchar = xpv_panic_putc;
715
716 /*
717 * If we find an unsupported panic_info structure, there's not much
718 * we can do other than complain, plow on, and hope for the best.
719 */
720 if (pip->pi_version != PANIC_INFO_VERSION)
721 xpv_panic_printf("Warning: Xen is using an unsupported "
722 "version of the panic_info structure.\n");
723
724 xpv_panic_info = pip;
725
726 #if defined(__amd64)
727 kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start;
728 if (xpv_panic_info->pi_xen_start == NULL) {
729 kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end;
730 } else {
731 kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start;
732 kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end;
733 kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end;
734 }
735 #endif
736
737 /*
738 * Make sure we are running on the Solaris %gs. The Xen panic code
739 * should already have set up the GDT properly.
740 */
741 xpv_panic_resetgs();
742 #if defined(__amd64)
743 wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
744 #endif
745
746 xpv_panic_time_init();
747
748 /*
749 * Switch to our own IDT, avoiding any accidental returns to Xen
750 * world.
751 */
752 switch_to_xpv_panic_idt();
753
754 /*
755 * Initialize the APIC timer, which is used to detect a hung dump
756 * attempt.
757 */
758 xpv_apicadr = pip->pi_apic;
759 xpv_apic_clkinit();
760
761 /*
762 * Set up a few values that we'll need repeatedly.
763 */
764 getcregs(&creg);
765 xpv_panic_cr3 = creg.cr_cr3;
766 for (l = mmu.max_level; l >= 0; l--)
767 xpv_panic_nptes[l] = mmu.ptes_per_table;
768 #ifdef __i386
769 if (mmu.pae_hat)
770 xpv_panic_nptes[mmu.max_level] = 4;
771 #endif
772
773 /* Add the fake Xen module to the module list */
774 if (xpv_module != NULL) {
775 extern int last_module_id;
776
777 xpv_modctl->mod_id = last_module_id++;
778 xpv_modctl->mod_next = &modules;
779 xpv_modctl->mod_prev = modules.mod_prev;
780 modules.mod_prev->mod_next = xpv_modctl;
781 modules.mod_prev = xpv_modctl;
782 }
783
784 if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC)
785 xpv_mca_panic_data = &pip->pi_mca;
786
787 xpv_panic_printf = printf;
788 xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr);
789 xpv_panic_printf("Failed to reboot following panic.\n");
790 for (;;)
791 ;
792 }
793
794 /*
795 * Set up the necessary data structures to pretend that the Xen hypervisor
796 * is a loadable module, allowing mdb to find the Xen symbols in a crash
797 * dump. Since these symbols all map to VA space Solaris doesn't normally
798 * have access to, we don't link these structures into the kernel's lists
799 * until/unless we hit a Xen panic.
800 *
801 * The observant reader will note a striking amount of overlap between this
802 * code and that found in krtld. While it would be handy if we could just
803 * ask krtld to do this work for us, it's not that simple. Among the
804 * complications: we're not actually loading the text here (grub did it at
805 * boot), the .text section is writable, there are no relocations to do,
806 * none of the module text/data is in readable memory, etc. Training krtld
807 * to deal with this weird module is as complicated, and more risky, than
808 * reimplementing the necessary subset of it here.
809 */
810 static void
init_xen_module()811 init_xen_module()
812 {
813 struct _buf *file = NULL;
814 struct module *mp;
815 struct modctl *mcp;
816 int i, shn;
817 Shdr *shp, *ctf_shp;
818 char *names = NULL;
819 size_t n, namesize, text_align, data_align;
820 #if defined(__amd64)
821 const char machine = EM_AMD64;
822 #else
823 const char machine = EM_386;
824 #endif
825
826 /* Allocate and init the module structure */
827 mp = kmem_zalloc(sizeof (*mp), KM_SLEEP);
828 mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
829 (void) strcpy(mp->filename, XPV_FILENAME);
830
831 /* Allocate and init the modctl structure */
832 mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP);
833 mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP);
834 (void) strcpy(mcp->mod_modname, XPV_MODNAME);
835 mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
836 (void) strcpy(mcp->mod_filename, XPV_FILENAME);
837 mcp->mod_inprogress_thread = (kthread_id_t)-1;
838 mcp->mod_ref = 1;
839 mcp->mod_loaded = 1;
840 mcp->mod_loadcnt = 1;
841 mcp->mod_mp = mp;
842
843 /*
844 * Try to open a Xen image that hasn't had its symbol and CTF
845 * information stripped off.
846 */
847 file = kobj_open_file(XPV_FILENAME);
848 if (file == (struct _buf *)-1) {
849 file = NULL;
850 goto err;
851 }
852
853 /*
854 * Read the header and ensure that this is an ELF file for the
855 * proper ISA. If it's not, somebody has done something very
856 * stupid. Why bother? See Mencken.
857 */
858 if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0)
859 goto err;
860 for (i = 0; i < SELFMAG; i++)
861 if (mp->hdr.e_ident[i] != ELFMAG[i])
862 goto err;
863 if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) ||
864 (mp->hdr.e_machine != machine))
865 goto err;
866
867 /* Read in the section headers */
868 n = mp->hdr.e_shentsize * mp->hdr.e_shnum;
869 mp->shdrs = kmem_zalloc(n, KM_SLEEP);
870 if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0)
871 goto err;
872
873 /* Read the section names */
874 shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize);
875 namesize = shp->sh_size;
876 names = kmem_zalloc(shp->sh_size, KM_SLEEP);
877 if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0)
878 goto err;
879
880 /*
881 * Fill in the text and data size fields.
882 */
883 ctf_shp = NULL;
884 text_align = data_align = 0;
885 for (shn = 1; shn < mp->hdr.e_shnum; shn++) {
886 shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize);
887
888 /* Sanity check the offset of the section name */
889 if (shp->sh_name >= namesize)
890 continue;
891
892 /* If we find the symtab section, remember it for later. */
893 if (shp->sh_type == SHT_SYMTAB) {
894 mp->symtbl_section = shn;
895 mp->symhdr = shp;
896 continue;
897 }
898
899 /* If we find the CTF section, remember it for later. */
900 if ((shp->sh_size != 0) &&
901 (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) {
902 ctf_shp = shp;
903 continue;
904 }
905
906 if (!(shp->sh_flags & SHF_ALLOC))
907 continue;
908
909 /*
910 * Xen marks its text section as writable, so we need to
911 * look for the name - not just the flag.
912 */
913 if ((strcmp(&names[shp->sh_name], ".text") != NULL) &&
914 (shp->sh_flags & SHF_WRITE) != 0) {
915 if (shp->sh_addralign > data_align)
916 data_align = shp->sh_addralign;
917 mp->data_size = ALIGN(mp->data_size, data_align);
918 mp->data_size += ALIGN(shp->sh_size, 8);
919 if (mp->data == NULL || mp->data > (char *)shp->sh_addr)
920 mp->data = (char *)shp->sh_addr;
921 } else {
922 if (shp->sh_addralign > text_align)
923 text_align = shp->sh_addralign;
924 mp->text_size = ALIGN(mp->text_size, text_align);
925 mp->text_size += ALIGN(shp->sh_size, 8);
926 if (mp->text == NULL || mp->text > (char *)shp->sh_addr)
927 mp->text = (char *)shp->sh_addr;
928 }
929 }
930 kmem_free(names, namesize);
931 names = NULL;
932 shp = NULL;
933 mcp->mod_text = mp->text;
934 mcp->mod_text_size = mp->text_size;
935
936 /*
937 * If we have symbol table and string table sections, read them in
938 * now. If we don't, we just plow on. We'll still get a valid
939 * core dump, but finding anything useful will be just a bit
940 * harder.
941 *
942 * Note: we don't bother with a hash table. We'll never do a
943 * symbol lookup unless we crash, and then mdb creates its own. We
944 * also don't try to perform any relocations. Xen should be loaded
945 * exactly where the ELF file indicates, and the symbol information
946 * in the file should be complete and correct already. Static
947 * linking ain't all bad.
948 */
949 if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) {
950 mp->strhdr = (Shdr *)
951 (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize);
952 mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize;
953
954 /* Allocate space for the symbol table and strings. */
955 mp->symsize = mp->symhdr->sh_size +
956 mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size;
957 mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP);
958 mp->symtbl = mp->symspace;
959 mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size);
960
961 if ((kobj_read_file(file, mp->symtbl,
962 mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) ||
963 (kobj_read_file(file, mp->strings,
964 mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0))
965 goto err;
966 }
967
968 /*
969 * Read in the CTF section
970 */
971 if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) {
972 mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP);
973 mp->ctfsize = ctf_shp->sh_size;
974 if (kobj_read_file(file, mp->ctfdata, mp->ctfsize,
975 ctf_shp->sh_offset) < 0)
976 goto err;
977 }
978
979 kobj_close_file(file);
980
981 xpv_module = mp;
982 xpv_modctl = mcp;
983 return;
984
985 err:
986 cmn_err(CE_WARN, "Failed to initialize xpv module.");
987 if (file != NULL)
988 kobj_close_file(file);
989
990 kmem_free(mp->filename, strlen(XPV_FILENAME) + 1);
991 if (mp->shdrs != NULL)
992 kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum);
993 if (mp->symspace != NULL)
994 kmem_free(mp->symspace, mp->symsize);
995 if (mp->ctfdata != NULL)
996 kmem_free(mp->ctfdata, mp->ctfsize);
997 kmem_free(mp, sizeof (*mp));
998 kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1);
999 kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1);
1000 kmem_free(mcp, sizeof (*mcp));
1001 if (names != NULL)
1002 kmem_free(names, namesize);
1003 }
1004
1005 void
xpv_panic_init()1006 xpv_panic_init()
1007 {
1008 xen_platform_op_t op;
1009 int i;
1010
1011 ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
1012
1013 for (i = 0; i < mmu.num_level; i++)
1014 ptable_pfn[i] = PFN_INVALID;
1015
1016 /* Let Xen know where to jump if/when it panics. */
1017 op.cmd = XENPF_panic_init;
1018 op.interface_version = XENPF_INTERFACE_VERSION;
1019 op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr;
1020
1021 (void) HYPERVISOR_platform_op(&op);
1022
1023 init_xen_module();
1024 }
1025