10Sstevel@tonic-gate /*
20Sstevel@tonic-gate * CDDL HEADER START
30Sstevel@tonic-gate *
40Sstevel@tonic-gate * The contents of this file are subject to the terms of the
52181Sayznaga * Common Development and Distribution License (the "License").
62181Sayznaga * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate *
80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate * See the License for the specific language governing permissions
110Sstevel@tonic-gate * and limitations under the License.
120Sstevel@tonic-gate *
130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate *
190Sstevel@tonic-gate * CDDL HEADER END
200Sstevel@tonic-gate */
210Sstevel@tonic-gate /*
22*11759SAnthony.Yznaga@Sun.COM * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
230Sstevel@tonic-gate * Use is subject to license terms.
240Sstevel@tonic-gate */
250Sstevel@tonic-gate
260Sstevel@tonic-gate #include <sys/types.h>
270Sstevel@tonic-gate #include <sys/machsystm.h>
283199Sep32863 #include <sys/sysmacros.h>
290Sstevel@tonic-gate #include <sys/cpuvar.h>
300Sstevel@tonic-gate #include <sys/async.h>
310Sstevel@tonic-gate #include <sys/ontrap.h>
320Sstevel@tonic-gate #include <sys/ddifm.h>
330Sstevel@tonic-gate #include <sys/hypervisor_api.h>
340Sstevel@tonic-gate #include <sys/errorq.h>
350Sstevel@tonic-gate #include <sys/promif.h>
360Sstevel@tonic-gate #include <sys/prom_plat.h>
370Sstevel@tonic-gate #include <sys/x_call.h>
380Sstevel@tonic-gate #include <sys/error.h>
390Sstevel@tonic-gate #include <sys/fm/util.h>
40541Srf157361 #include <sys/ivintr.h>
413156Sgirish #include <sys/archsystm.h>
420Sstevel@tonic-gate
#define	MAX_CE_FLTS	10
#define	MAX_ASYNC_FLTS	6

errorq_t *ue_queue;			/* queue of uncorrectable errors */
errorq_t *ce_queue;			/* queue of correctable errors */
errorq_t *errh_queue;			/* queue of sun4v error reports */

/*
 * Being used by memory test driver.
 * ce_verbose_memory - covers CEs in DIMMs
 * ce_verbose_other - covers "others" (ecache, IO, etc.)
 *
 * If the value is 0, nothing is logged.
 * If the value is 1, the error is logged to the log file, but not console.
 * If the value is 2, the error is logged to the log file and console.
 */
int	ce_verbose_memory = 1;
int	ce_verbose_other = 1;

int	ce_show_data = 0;
int	ce_debug = 0;
int	ue_debug = 0;
int	reset_debug = 0;

/*
 * Tunables for controlling the handling of asynchronous faults (AFTs). Setting
 * these to non-default values on a non-DEBUG kernel is NOT supported.
 */
int	aft_verbose = 0;	/* log AFT messages > 1 to log only */
int	aft_panic = 0;		/* panic (not reboot) on fatal usermode AFLT */
int	aft_testfatal = 0;	/* force all AFTs to panic immediately */

/*
 * Used for vbsc hostshutdown (power-off button)
 */
int	err_shutdown_triggered = 0;	/* only once */
uint64_t err_shutdown_inum = 0;		/* used to pull the trigger */

/*
 * Used to print NRE/RE via system variable or kmdb
 */
int	printerrh = 0;		/* see /etc/system */
static void errh_er_print(errh_er_t *, const char *);
kmutex_t errh_print_lock;	/* serializes errh_er_print() output */

/*
 * Defined in bus_func.c but initialised in error_init
 */
extern kmutex_t bfd_lock;

/* Count of resumable-queue overflow events; bumped by rq_overflow(). */
static uint32_t rq_overflow_count = 0;		/* counter for rq overflow */

static void cpu_queue_one_event(errh_async_flt_t *);
static uint32_t count_entries_on_queue(uint64_t, uint64_t, uint32_t);
static void errh_page_retire(errh_async_flt_t *, uchar_t);
static int errh_error_protected(struct regs *, struct async_flt *, int *);
static void errh_rq_full(struct async_flt *);
static void ue_drain(void *, struct async_flt *, errorq_elem_t *);
static void ce_drain(void *, struct async_flt *, errorq_elem_t *);
static void errh_drain(void *, errh_er_t *, errorq_elem_t *);
static void errh_handle_attr(errh_async_flt_t *);
static void errh_handle_asr(errh_async_flt_t *);
static void errh_handle_sp(errh_er_t *);
static void sp_ereport_post(uint8_t);
1070Sstevel@tonic-gate
/*
 * Drain the per-CPU resumable error queue (RQ) from head_offset up to
 * tail_offset.  Each hypervisor error report is copied into a local
 * errh_async_flt_t, the queue slot is released back to the hypervisor
 * (ehdl = 0), and the report is dispatched according to its descriptor.
 * rp is unused here (ARGSUSED).
 */
/*ARGSUSED*/
void
process_resumable_error(struct regs *rp, uint32_t head_offset,
	uint32_t tail_offset)
{
	struct machcpu *mcpup;
	struct async_flt *aflt;
	errh_async_flt_t errh_flt;
	errh_er_t *head_va;

	mcpup = &(CPU->cpu_m);

	while (head_offset != tail_offset) {
		/* kernel buffer starts right after the resumable queue */
		head_va = (errh_er_t *)(mcpup->cpu_rq_va + head_offset +
		    CPU_RQ_SIZE);
		/* Copy the error report to local buffer */
		bzero(&errh_flt, sizeof (errh_async_flt_t));
		bcopy((char *)head_va, &(errh_flt.errh_er),
		    sizeof (errh_er_t));

		/* Remember the last RE seen; useful from kmdb. */
		mcpup->cpu_rq_lastre = head_va;
		if (printerrh)
			errh_er_print(&errh_flt.errh_er, "RQ");

		/* Increment the queue head */
		head_offset += Q_ENTRY_SIZE;
		/* Wrap around (CPU_RQ_SIZE is a power of two) */
		head_offset &= (CPU_RQ_SIZE - 1);

		/* set error handle to zero so it can hold new error report */
		head_va->ehdl = 0;

		switch (errh_flt.errh_er.desc) {
		case ERRH_DESC_UCOR_RE:
			/*
			 * Check error attribute, handle individual error
			 * if it is needed.
			 */
			errh_handle_attr(&errh_flt);
			break;

		case ERRH_DESC_WARN_RE:
			/*
			 * Power-off requested, but handle it one time only.
			 */
			if (!err_shutdown_triggered) {
				setsoftint(err_shutdown_inum);
				++err_shutdown_triggered;
			}
			/* Not queued as a fault; move to the next entry. */
			continue;

		case ERRH_DESC_SP:
			/*
			 * The state of the SP has changed.  Hand the report
			 * to the errh_queue for softint-level processing.
			 */
			errorq_dispatch(errh_queue, &errh_flt.errh_er,
			    sizeof (errh_er_t), ERRORQ_ASYNC);
			continue;

		default:
			cmn_err(CE_WARN, "Error Descriptor 0x%llx "
			    " invalid in resumable error handler",
			    (long long) errh_flt.errh_er.desc);
			continue;
		}

		aflt = (struct async_flt *)&(errh_flt.cmn_asyncflt);
		aflt->flt_id = gethrtime();
		aflt->flt_bus_id = getprocessorid();
		aflt->flt_class = CPU_FAULT;
		aflt->flt_prot = AFLT_PROT_NONE;
		aflt->flt_priv = (((errh_flt.errh_er.attr & ERRH_MODE_MASK)
		    >> ERRH_MODE_SHIFT) == ERRH_MODE_PRIV);

		if (errh_flt.errh_er.attr & ERRH_ATTR_CPU)
			/* If it is an error on other cpu */
			aflt->flt_panic = 1;
		else
			aflt->flt_panic = 0;

		/*
		 * Handle resumable queue full case.
		 */
		if (errh_flt.errh_er.attr & ERRH_ATTR_RQF) {
			(void) errh_rq_full(aflt);
		}

		/*
		 * Queue the error on ce or ue queue depend on flt_panic.
		 * Even if flt_panic is set, the code still keep processing
		 * the rest element on rq until the panic starts.
		 */
		(void) cpu_queue_one_event(&errh_flt);

		/*
		 * Panic here if aflt->flt_panic has been set.
		 * Enqueued errors will be logged as part of the panic flow.
		 */
		if (aflt->flt_panic) {
			fm_panic("Unrecoverable error on another CPU");
		}
	}
}
2120Sstevel@tonic-gate
/*
 * Drain the per-CPU non-resumable error queue (NRQ) from head_offset up to
 * tail_offset.  flags carries the trap level in ERRH_TL_MASK and the
 * ERRH_U_SPILL_FILL bit for the first entry.  Errors that occurred at
 * TL > 0, or unprotected kernel-mode errors, force a panic; user-mode and
 * lofault/on_trap-protected errors are handled and posted via AST.
 */
void
process_nonresumable_error(struct regs *rp, uint64_t flags,
	uint32_t head_offset, uint32_t tail_offset)
{
	struct machcpu *mcpup;
	struct async_flt *aflt;
	errh_async_flt_t errh_flt;
	errh_er_t *head_va;
	int trampolined = 0;
	int expected = DDI_FM_ERR_UNEXPECTED;
	uint64_t exec_mode;
	uint8_t u_spill_fill;

	mcpup = &(CPU->cpu_m);

	while (head_offset != tail_offset) {
		/* kernel buffer starts right after the nonresumable queue */
		head_va = (errh_er_t *)(mcpup->cpu_nrq_va + head_offset +
		    CPU_NRQ_SIZE);

		/* Copy the error report to local buffer */
		bzero(&errh_flt, sizeof (errh_async_flt_t));

		bcopy((char *)head_va, &(errh_flt.errh_er),
		    sizeof (errh_er_t));

		/* Remember the last NRE seen; useful from kmdb. */
		mcpup->cpu_nrq_lastnre = head_va;
		if (printerrh)
			errh_er_print(&errh_flt.errh_er, "NRQ");

		/* Increment the queue head */
		head_offset += Q_ENTRY_SIZE;
		/* Wrap around (CPU_NRQ_SIZE is a power of two) */
		head_offset &= (CPU_NRQ_SIZE - 1);

		/* set error handle to zero so it can hold new error report */
		head_va->ehdl = 0;

		aflt = (struct async_flt *)&(errh_flt.cmn_asyncflt);

		trampolined = 0;

		if (errh_flt.errh_er.attr & ERRH_ATTR_PIO)
			aflt->flt_class = BUS_FAULT;
		else
			aflt->flt_class = CPU_FAULT;

		aflt->flt_id = gethrtime();
		aflt->flt_bus_id = getprocessorid();
		aflt->flt_pc = (caddr_t)rp->r_pc;
		exec_mode = (errh_flt.errh_er.attr & ERRH_MODE_MASK)
		    >> ERRH_MODE_SHIFT;
		/* Treat unknown execution mode conservatively as privileged. */
		aflt->flt_priv = (exec_mode == ERRH_MODE_PRIV ||
		    exec_mode == ERRH_MODE_UNKNOWN);
		aflt->flt_prot = AFLT_PROT_NONE;
		aflt->flt_tl = (uchar_t)(flags & ERRH_TL_MASK);
		/* Errors taken at TL > 0 (or forced by aft_testfatal) panic. */
		aflt->flt_panic = ((aflt->flt_tl != 0) ||
		    (aft_testfatal != 0));

		/*
		 * For the first error packet on the queue, check if it
		 * happened in user fill/spill trap.
		 */
		if (flags & ERRH_U_SPILL_FILL) {
			u_spill_fill = 1;
			/* clear the user fill/spill flag in flags */
			flags = (uint64_t)aflt->flt_tl;
		} else
			u_spill_fill = 0;

		switch (errh_flt.errh_er.desc) {
		case ERRH_DESC_PR_NRE:
			if (u_spill_fill) {
				aflt->flt_panic = 0;
				break;
			}
			/*
			 * Fall through, precise fault also need to check
			 * to see if it was protected.
			 */
			/*FALLTHRU*/

		case ERRH_DESC_DEF_NRE:
			/*
			 * If the trap occurred in privileged mode at TL=0,
			 * we need to check to see if we were executing
			 * in kernel under on_trap() or t_lofault
			 * protection. If so, and if it was a PIO or MEM
			 * error, then modify the saved registers so that
			 * we return from the trap to the appropriate
			 * trampoline routine.
			 */
			if (aflt->flt_priv == 1 && aflt->flt_tl == 0 &&
			    ((errh_flt.errh_er.attr & ERRH_ATTR_PIO) ||
			    (errh_flt.errh_er.attr & ERRH_ATTR_MEM))) {
				trampolined =
				    errh_error_protected(rp, aflt, &expected);
			}

			if (!aflt->flt_priv || aflt->flt_prot ==
			    AFLT_PROT_COPY) {
				aflt->flt_panic |= aft_panic;
			} else if (!trampolined &&
			    (aflt->flt_class != BUS_FAULT)) {
				aflt->flt_panic = 1;
			}

			/*
			 * Check error attribute, handle individual error
			 * if it is needed.
			 */
			errh_handle_attr(&errh_flt);

			/*
			 * If PIO error, we need to query the bus nexus
			 * for fatal errors.
			 */
			if (aflt->flt_class == BUS_FAULT) {
				aflt->flt_addr = errh_flt.errh_er.ra;
				errh_cpu_run_bus_error_handlers(aflt,
				    expected);
			}

			break;

		case ERRH_DESC_USER_DCORE:
			/*
			 * User generated panic. Call panic directly
			 * since there are no FMA e-reports to
			 * display.
			 */

			panic("Panic - Generated at user request");

			break;

		default:
			cmn_err(CE_WARN, "Panic - Error Descriptor 0x%llx "
			    " invalid in non-resumable error handler",
			    (long long) errh_flt.errh_er.desc);
			aflt->flt_panic = 1;
			break;
		}

		/*
		 * Queue the error report for further processing. If
		 * flt_panic is set, code still process other errors
		 * in the queue until the panic routine stops the
		 * kernel.
		 */
		(void) cpu_queue_one_event(&errh_flt);

		/*
		 * Panic here if aflt->flt_panic has been set.
		 * Enqueued errors will be logged as part of the panic flow.
		 */
		if (aflt->flt_panic) {
			fm_panic("Unrecoverable hardware error");
		}

		/*
		 * Call page_retire() to handle memory errors.
		 */
		if (errh_flt.errh_er.attr & ERRH_ATTR_MEM)
			errh_page_retire(&errh_flt, PR_UE);

		/*
		 * If we queued an error and the it was in user mode, or
		 * protected by t_lofault, or user_spill_fill is set, we
		 * set AST flag so the queue will be drained before
		 * returning to user mode.
		 */
		if (!aflt->flt_priv || aflt->flt_prot == AFLT_PROT_COPY ||
		    u_spill_fill) {
			int pcb_flag = 0;

			if (aflt->flt_class == CPU_FAULT)
				pcb_flag |= ASYNC_HWERR;
			else if (aflt->flt_class == BUS_FAULT)
				pcb_flag |= ASYNC_BERR;

			ttolwp(curthread)->lwp_pcb.pcb_flags |= pcb_flag;
			aston(curthread);
		}
	}
}
3990Sstevel@tonic-gate
4000Sstevel@tonic-gate /*
4010Sstevel@tonic-gate * For PIO errors, this routine calls nexus driver's error
4020Sstevel@tonic-gate * callback routines. If the callback routine returns fatal, and
4030Sstevel@tonic-gate * we are in kernel or unknow mode without any error protection,
4040Sstevel@tonic-gate * we need to turn on the panic flag.
4050Sstevel@tonic-gate */
4060Sstevel@tonic-gate void
errh_cpu_run_bus_error_handlers(struct async_flt * aflt,int expected)4070Sstevel@tonic-gate errh_cpu_run_bus_error_handlers(struct async_flt *aflt, int expected)
4080Sstevel@tonic-gate {
4090Sstevel@tonic-gate int status;
4100Sstevel@tonic-gate ddi_fm_error_t de;
4110Sstevel@tonic-gate
4120Sstevel@tonic-gate bzero(&de, sizeof (ddi_fm_error_t));
4130Sstevel@tonic-gate
4140Sstevel@tonic-gate de.fme_version = DDI_FME_VERSION;
4150Sstevel@tonic-gate de.fme_ena = fm_ena_generate(aflt->flt_id, FM_ENA_FMT1);
4160Sstevel@tonic-gate de.fme_flag = expected;
4170Sstevel@tonic-gate de.fme_bus_specific = (void *)aflt->flt_addr;
4180Sstevel@tonic-gate status = ndi_fm_handler_dispatch(ddi_root_node(), NULL, &de);
4190Sstevel@tonic-gate
4200Sstevel@tonic-gate /*
4210Sstevel@tonic-gate * If error is protected, it will jump to proper routine
4220Sstevel@tonic-gate * to handle the handle; if it is in user level, we just
4230Sstevel@tonic-gate * kill the user process; if the driver thinks the error is
4240Sstevel@tonic-gate * not fatal, we can drive on. If none of above are true,
4250Sstevel@tonic-gate * we panic
4260Sstevel@tonic-gate */
4270Sstevel@tonic-gate if ((aflt->flt_prot == AFLT_PROT_NONE) && (aflt->flt_priv == 1) &&
4280Sstevel@tonic-gate (status == DDI_FM_FATAL))
4290Sstevel@tonic-gate aflt->flt_panic = 1;
4300Sstevel@tonic-gate }
4310Sstevel@tonic-gate
4320Sstevel@tonic-gate /*
4330Sstevel@tonic-gate * This routine checks to see if we are under any error protection when
4340Sstevel@tonic-gate * the error happens. If we are under error protection, we unwind to
4350Sstevel@tonic-gate * the protection and indicate fault.
4360Sstevel@tonic-gate */
4370Sstevel@tonic-gate static int
errh_error_protected(struct regs * rp,struct async_flt * aflt,int * expected)4380Sstevel@tonic-gate errh_error_protected(struct regs *rp, struct async_flt *aflt, int *expected)
4390Sstevel@tonic-gate {
4400Sstevel@tonic-gate int trampolined = 0;
4410Sstevel@tonic-gate ddi_acc_hdl_t *hp;
4420Sstevel@tonic-gate
4430Sstevel@tonic-gate if (curthread->t_ontrap != NULL) {
4440Sstevel@tonic-gate on_trap_data_t *otp = curthread->t_ontrap;
4450Sstevel@tonic-gate
4460Sstevel@tonic-gate if (otp->ot_prot & OT_DATA_EC) {
4470Sstevel@tonic-gate aflt->flt_prot = AFLT_PROT_EC;
4480Sstevel@tonic-gate otp->ot_trap |= OT_DATA_EC;
4490Sstevel@tonic-gate rp->r_pc = otp->ot_trampoline;
4500Sstevel@tonic-gate rp->r_npc = rp->r_pc +4;
4510Sstevel@tonic-gate trampolined = 1;
4520Sstevel@tonic-gate }
4530Sstevel@tonic-gate
4540Sstevel@tonic-gate if (otp->ot_prot & OT_DATA_ACCESS) {
4550Sstevel@tonic-gate aflt->flt_prot = AFLT_PROT_ACCESS;
4560Sstevel@tonic-gate otp->ot_trap |= OT_DATA_ACCESS;
4570Sstevel@tonic-gate rp->r_pc = otp->ot_trampoline;
4580Sstevel@tonic-gate rp->r_npc = rp->r_pc + 4;
4590Sstevel@tonic-gate trampolined = 1;
4600Sstevel@tonic-gate /*
4610Sstevel@tonic-gate * for peek and caut_gets
4620Sstevel@tonic-gate * errors are expected
4630Sstevel@tonic-gate */
4640Sstevel@tonic-gate hp = (ddi_acc_hdl_t *)otp->ot_handle;
4650Sstevel@tonic-gate if (!hp)
4660Sstevel@tonic-gate *expected = DDI_FM_ERR_PEEK;
4670Sstevel@tonic-gate else if (hp->ah_acc.devacc_attr_access ==
4680Sstevel@tonic-gate DDI_CAUTIOUS_ACC)
4690Sstevel@tonic-gate *expected = DDI_FM_ERR_EXPECTED;
4700Sstevel@tonic-gate }
4710Sstevel@tonic-gate } else if (curthread->t_lofault) {
4720Sstevel@tonic-gate aflt->flt_prot = AFLT_PROT_COPY;
4730Sstevel@tonic-gate rp->r_g1 = EFAULT;
4740Sstevel@tonic-gate rp->r_pc = curthread->t_lofault;
4750Sstevel@tonic-gate rp->r_npc = rp->r_pc + 4;
4760Sstevel@tonic-gate trampolined = 1;
4770Sstevel@tonic-gate }
4780Sstevel@tonic-gate
4790Sstevel@tonic-gate return (trampolined);
4800Sstevel@tonic-gate }
4810Sstevel@tonic-gate
4820Sstevel@tonic-gate /*
4830Sstevel@tonic-gate * Queue one event.
4840Sstevel@tonic-gate */
4850Sstevel@tonic-gate static void
cpu_queue_one_event(errh_async_flt_t * errh_fltp)4860Sstevel@tonic-gate cpu_queue_one_event(errh_async_flt_t *errh_fltp)
4870Sstevel@tonic-gate {
4880Sstevel@tonic-gate struct async_flt *aflt = (struct async_flt *)errh_fltp;
4890Sstevel@tonic-gate errorq_t *eqp;
4900Sstevel@tonic-gate
4910Sstevel@tonic-gate if (aflt->flt_panic)
4920Sstevel@tonic-gate eqp = ue_queue;
4930Sstevel@tonic-gate else
4940Sstevel@tonic-gate eqp = ce_queue;
4950Sstevel@tonic-gate
4960Sstevel@tonic-gate errorq_dispatch(eqp, errh_fltp, sizeof (errh_async_flt_t),
4970Sstevel@tonic-gate aflt->flt_panic);
4980Sstevel@tonic-gate }
4990Sstevel@tonic-gate
5000Sstevel@tonic-gate /*
5010Sstevel@tonic-gate * The cpu_async_log_err() function is called by the ce/ue_drain() function to
5020Sstevel@tonic-gate * handle logging for CPU events that are dequeued. As such, it can be invoked
5030Sstevel@tonic-gate * from softint context, from AST processing in the trap() flow, or from the
5040Sstevel@tonic-gate * panic flow. We decode the CPU-specific data, and log appropriate messages.
5050Sstevel@tonic-gate */
5060Sstevel@tonic-gate void
cpu_async_log_err(void * flt)5070Sstevel@tonic-gate cpu_async_log_err(void *flt)
5080Sstevel@tonic-gate {
5090Sstevel@tonic-gate errh_async_flt_t *errh_fltp = (errh_async_flt_t *)flt;
5100Sstevel@tonic-gate errh_er_t *errh_erp = (errh_er_t *)&errh_fltp->errh_er;
5110Sstevel@tonic-gate
5120Sstevel@tonic-gate switch (errh_erp->desc) {
5130Sstevel@tonic-gate case ERRH_DESC_UCOR_RE:
5140Sstevel@tonic-gate if (errh_erp->attr & ERRH_ATTR_MEM) {
5150Sstevel@tonic-gate /*
516917Selowe * Turn on the PR_UE flag. The page will be
5170Sstevel@tonic-gate * scrubbed when it is freed.
5180Sstevel@tonic-gate */
519917Selowe errh_page_retire(errh_fltp, PR_UE);
5200Sstevel@tonic-gate }
5210Sstevel@tonic-gate
5220Sstevel@tonic-gate break;
5230Sstevel@tonic-gate
5240Sstevel@tonic-gate case ERRH_DESC_PR_NRE:
5250Sstevel@tonic-gate case ERRH_DESC_DEF_NRE:
5260Sstevel@tonic-gate if (errh_erp->attr & ERRH_ATTR_MEM) {
5270Sstevel@tonic-gate /*
5280Sstevel@tonic-gate * For non-resumable memory error, retire
5290Sstevel@tonic-gate * the page here.
5300Sstevel@tonic-gate */
531917Selowe errh_page_retire(errh_fltp, PR_UE);
532639Swh94709
533639Swh94709 /*
534639Swh94709 * If we are going to panic, scrub the page first
535639Swh94709 */
536639Swh94709 if (errh_fltp->cmn_asyncflt.flt_panic)
53710271SJason.Beloro@Sun.COM mem_scrub(errh_fltp->errh_er.ra,
538639Swh94709 errh_fltp->errh_er.sz);
5390Sstevel@tonic-gate }
5400Sstevel@tonic-gate break;
5410Sstevel@tonic-gate
5420Sstevel@tonic-gate default:
5430Sstevel@tonic-gate break;
5440Sstevel@tonic-gate }
5450Sstevel@tonic-gate }
5460Sstevel@tonic-gate
5470Sstevel@tonic-gate /*
5480Sstevel@tonic-gate * Called from ce_drain().
5490Sstevel@tonic-gate */
5500Sstevel@tonic-gate void
cpu_ce_log_err(struct async_flt * aflt)5510Sstevel@tonic-gate cpu_ce_log_err(struct async_flt *aflt)
5520Sstevel@tonic-gate {
5530Sstevel@tonic-gate switch (aflt->flt_class) {
5540Sstevel@tonic-gate case CPU_FAULT:
5550Sstevel@tonic-gate cpu_async_log_err(aflt);
5560Sstevel@tonic-gate break;
5570Sstevel@tonic-gate
5580Sstevel@tonic-gate case BUS_FAULT:
5590Sstevel@tonic-gate cpu_async_log_err(aflt);
5600Sstevel@tonic-gate break;
5610Sstevel@tonic-gate
5620Sstevel@tonic-gate default:
5630Sstevel@tonic-gate break;
5640Sstevel@tonic-gate }
5650Sstevel@tonic-gate }
5660Sstevel@tonic-gate
5670Sstevel@tonic-gate /*
5680Sstevel@tonic-gate * Called from ue_drain().
5690Sstevel@tonic-gate */
5700Sstevel@tonic-gate void
cpu_ue_log_err(struct async_flt * aflt)5710Sstevel@tonic-gate cpu_ue_log_err(struct async_flt *aflt)
5720Sstevel@tonic-gate {
5730Sstevel@tonic-gate switch (aflt->flt_class) {
5740Sstevel@tonic-gate case CPU_FAULT:
5750Sstevel@tonic-gate cpu_async_log_err(aflt);
5760Sstevel@tonic-gate break;
5770Sstevel@tonic-gate
5780Sstevel@tonic-gate case BUS_FAULT:
5790Sstevel@tonic-gate cpu_async_log_err(aflt);
5800Sstevel@tonic-gate break;
5810Sstevel@tonic-gate
5820Sstevel@tonic-gate default:
5830Sstevel@tonic-gate break;
5840Sstevel@tonic-gate }
5850Sstevel@tonic-gate }
5860Sstevel@tonic-gate
5870Sstevel@tonic-gate /*
5880Sstevel@tonic-gate * Turn on flag on the error memory region.
5890Sstevel@tonic-gate */
5900Sstevel@tonic-gate static void
errh_page_retire(errh_async_flt_t * errh_fltp,uchar_t flag)591917Selowe errh_page_retire(errh_async_flt_t *errh_fltp, uchar_t flag)
5920Sstevel@tonic-gate {
59310271SJason.Beloro@Sun.COM uint64_t flt_real_addr_start = errh_fltp->errh_er.ra;
5940Sstevel@tonic-gate uint64_t flt_real_addr_end = flt_real_addr_start +
5950Sstevel@tonic-gate errh_fltp->errh_er.sz - 1;
5960Sstevel@tonic-gate int64_t current_addr;
5970Sstevel@tonic-gate
5980Sstevel@tonic-gate if (errh_fltp->errh_er.sz == 0)
5990Sstevel@tonic-gate return;
6000Sstevel@tonic-gate
6010Sstevel@tonic-gate for (current_addr = flt_real_addr_start;
6020Sstevel@tonic-gate current_addr < flt_real_addr_end; current_addr += MMU_PAGESIZE) {
603917Selowe (void) page_retire(current_addr, flag);
6040Sstevel@tonic-gate }
6050Sstevel@tonic-gate }
6060Sstevel@tonic-gate
6070Sstevel@tonic-gate void
mem_scrub(uint64_t paddr,uint64_t len)6080Sstevel@tonic-gate mem_scrub(uint64_t paddr, uint64_t len)
6090Sstevel@tonic-gate {
6100Sstevel@tonic-gate uint64_t pa, length, scrubbed_len;
6110Sstevel@tonic-gate
6120Sstevel@tonic-gate pa = paddr;
6130Sstevel@tonic-gate length = len;
6140Sstevel@tonic-gate scrubbed_len = 0;
6150Sstevel@tonic-gate
616639Swh94709 while (length > 0) {
617639Swh94709 if (hv_mem_scrub(pa, length, &scrubbed_len) != H_EOK)
6180Sstevel@tonic-gate break;
6190Sstevel@tonic-gate
6200Sstevel@tonic-gate pa += scrubbed_len;
6210Sstevel@tonic-gate length -= scrubbed_len;
6220Sstevel@tonic-gate }
6230Sstevel@tonic-gate }
6240Sstevel@tonic-gate
6251457Swh94709 /*
6263199Sep32863 * Call hypervisor to flush the memory region.
6273199Sep32863 * Both va and len must be MMU_PAGESIZE aligned.
6283199Sep32863 * Returns the total number of bytes flushed.
6291457Swh94709 */
6303199Sep32863 uint64_t
mem_sync(caddr_t orig_va,size_t orig_len)6313729Swh94709 mem_sync(caddr_t orig_va, size_t orig_len)
6320Sstevel@tonic-gate {
6330Sstevel@tonic-gate uint64_t pa, length, flushed;
6343199Sep32863 uint64_t chunk_len = MMU_PAGESIZE;
6353199Sep32863 uint64_t total_flushed = 0;
6363729Swh94709 uint64_t va, len;
6370Sstevel@tonic-gate
6383729Swh94709 if (orig_len == 0)
6393199Sep32863 return (total_flushed);
6400Sstevel@tonic-gate
6413729Swh94709 /* align va */
6423729Swh94709 va = P2ALIGN_TYPED(orig_va, MMU_PAGESIZE, uint64_t);
6433729Swh94709 /* round up len to MMU_PAGESIZE aligned */
6443729Swh94709 len = P2ROUNDUP_TYPED(orig_va + orig_len, MMU_PAGESIZE, uint64_t) - va;
6453729Swh94709
6463199Sep32863 while (len > 0) {
6473199Sep32863 pa = va_to_pa((caddr_t)va);
6483199Sep32863 if (pa == (uint64_t)-1)
6493199Sep32863 return (total_flushed);
6500Sstevel@tonic-gate
6513199Sep32863 length = chunk_len;
6523199Sep32863 flushed = 0;
6531457Swh94709
6543199Sep32863 while (length > 0) {
6553199Sep32863 if (hv_mem_sync(pa, length, &flushed) != H_EOK)
6563199Sep32863 return (total_flushed);
6570Sstevel@tonic-gate
6583199Sep32863 pa += flushed;
6593199Sep32863 length -= flushed;
6603199Sep32863 total_flushed += flushed;
6613199Sep32863 }
6620Sstevel@tonic-gate
6633199Sep32863 va += chunk_len;
6643199Sep32863 len -= chunk_len;
6650Sstevel@tonic-gate }
6663199Sep32863
6673199Sep32863 return (total_flushed);
6680Sstevel@tonic-gate }
6690Sstevel@tonic-gate
6700Sstevel@tonic-gate /*
6710Sstevel@tonic-gate * If resumable queue is full, we need to check if any cpu is in
6720Sstevel@tonic-gate * error state. If not, we drive on. If yes, we need to panic. The
6730Sstevel@tonic-gate * hypervisor call hv_cpu_state() is being used for checking the
6743750Srf157361 * cpu state. And reset %tick_compr in case tick-compare was lost.
6750Sstevel@tonic-gate */
6760Sstevel@tonic-gate static void
errh_rq_full(struct async_flt * afltp)6770Sstevel@tonic-gate errh_rq_full(struct async_flt *afltp)
6780Sstevel@tonic-gate {
6790Sstevel@tonic-gate processorid_t who;
6800Sstevel@tonic-gate uint64_t cpu_state;
6810Sstevel@tonic-gate uint64_t retval;
6823750Srf157361 uint64_t current_tick;
6833750Srf157361
6843750Srf157361 current_tick = (uint64_t)gettick();
6853750Srf157361 tickcmpr_set(current_tick);
6860Sstevel@tonic-gate
6870Sstevel@tonic-gate for (who = 0; who < NCPU; who++)
6880Sstevel@tonic-gate if (CPU_IN_SET(cpu_ready_set, who)) {
6890Sstevel@tonic-gate retval = hv_cpu_state(who, &cpu_state);
6900Sstevel@tonic-gate if (retval != H_EOK || cpu_state == CPU_STATE_ERROR) {
6910Sstevel@tonic-gate afltp->flt_panic = 1;
6920Sstevel@tonic-gate break;
6930Sstevel@tonic-gate }
6940Sstevel@tonic-gate }
6950Sstevel@tonic-gate }
6960Sstevel@tonic-gate
6970Sstevel@tonic-gate /*
6980Sstevel@tonic-gate * Return processor specific async error structure
6990Sstevel@tonic-gate * size used.
7000Sstevel@tonic-gate */
7010Sstevel@tonic-gate int
cpu_aflt_size(void)7020Sstevel@tonic-gate cpu_aflt_size(void)
7030Sstevel@tonic-gate {
7040Sstevel@tonic-gate return (sizeof (errh_async_flt_t));
7050Sstevel@tonic-gate }
7060Sstevel@tonic-gate
7070Sstevel@tonic-gate #define SZ_TO_ETRS_SHIFT 6
7080Sstevel@tonic-gate
7090Sstevel@tonic-gate /*
7100Sstevel@tonic-gate * Message print out when resumable queue is overflown
7110Sstevel@tonic-gate */
7120Sstevel@tonic-gate /*ARGSUSED*/
7130Sstevel@tonic-gate void
rq_overflow(struct regs * rp,uint64_t head_offset,uint64_t tail_offset)7140Sstevel@tonic-gate rq_overflow(struct regs *rp, uint64_t head_offset,
7150Sstevel@tonic-gate uint64_t tail_offset)
7160Sstevel@tonic-gate {
7170Sstevel@tonic-gate rq_overflow_count++;
7180Sstevel@tonic-gate }
7190Sstevel@tonic-gate
7200Sstevel@tonic-gate /*
7210Sstevel@tonic-gate * Handler to process a fatal error. This routine can be called from a
7220Sstevel@tonic-gate * softint, called from trap()'s AST handling, or called from the panic flow.
7230Sstevel@tonic-gate */
/*ARGSUSED*/
static void
ue_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep)
{
	/* Hand the uncorrectable-error fault to the CPU module's logger. */
	cpu_ue_log_err(aflt);
}
7300Sstevel@tonic-gate
7310Sstevel@tonic-gate /*
7320Sstevel@tonic-gate * Handler to process a correctable error. This routine can be called from a
7330Sstevel@tonic-gate * softint. We just call the CPU module's logging routine.
7340Sstevel@tonic-gate */
/*ARGSUSED*/
static void
ce_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep)
{
	/* Hand the correctable-error fault to the CPU module's logger. */
	cpu_ce_log_err(aflt);
}
7410Sstevel@tonic-gate
7420Sstevel@tonic-gate /*
74311354SAnthony.Yznaga@Sun.COM * Handler to process a sun4v errort report via an errorq_t. This routine
74411354SAnthony.Yznaga@Sun.COM * can be called from a softint.
74511354SAnthony.Yznaga@Sun.COM *
74611354SAnthony.Yznaga@Sun.COM * This is used for sun4v error reports that cannot be processed at high-level
74711354SAnthony.Yznaga@Sun.COM * interrupt time. Currently only error reports indicating an SP state change
74811354SAnthony.Yznaga@Sun.COM * are handled in this manner.
74911354SAnthony.Yznaga@Sun.COM */
/*ARGSUSED*/
static void
errh_drain(void *ignored, errh_er_t *errh_erp, errorq_elem_t *eqep)
{
	/*
	 * Only SP state-change reports are queued to errh_queue (see the
	 * enqueue path); any other descriptor here indicates a bug.
	 */
	ASSERT(errh_erp->desc == ERRH_DESC_SP);

	errh_handle_sp(errh_erp);
}
75811354SAnthony.Yznaga@Sun.COM
75911354SAnthony.Yznaga@Sun.COM /*
760541Srf157361 * Handler to process vbsc hostshutdown (power-off button).
761541Srf157361 */
/*
 * Soft interrupt handler for a vbsc power-off request: warn the
 * console, start an orderly shutdown, and arm a timed power_down()
 * as a backstop.  Always claims the interrupt.
 */
static int
err_shutdown_softintr()
{
	cmn_err(CE_WARN, "Power-off requested, system will now shutdown.");
	do_shutdown();

	/*
	 * just in case do_shutdown() fails
	 */
	(void) timeout((void(*)(void *))power_down, NULL, 100 * hz);
	return (DDI_INTR_CLAIMED);
}
774541Srf157361
775541Srf157361 /*
7760Sstevel@tonic-gate * Allocate error queue sizes based on max_ncpus. max_ncpus is set just
7770Sstevel@tonic-gate * after ncpunode has been determined. ncpus is set in start_other_cpus
7780Sstevel@tonic-gate * which is called after error_init() but may change dynamically.
7790Sstevel@tonic-gate */
void
error_init(void)
{
	char tmp_name[MAXSYSNAME];
	pnode_t node;
	size_t size = cpu_aflt_size();

	/*
	 * Initialize the correctable and uncorrectable error queues.
	 * Sizes scale with max_ncpus (+1 slack) since ncpus itself may
	 * still change after this point.
	 */
	ue_queue = errorq_create("ue_queue", (errorq_func_t)ue_drain, NULL,
	    MAX_ASYNC_FLTS * (max_ncpus + 1), size, PIL_2, ERRORQ_VITAL);

	ce_queue = errorq_create("ce_queue", (errorq_func_t)ce_drain, NULL,
	    MAX_CE_FLTS * (max_ncpus + 1), size, PIL_1, 0);

	/* Queue for error reports (currently SP state changes) drained at PIL_1. */
	errh_queue = errorq_create("errh_queue", (errorq_func_t)errh_drain,
	    NULL, CPU_RQ_ENTRIES, sizeof (errh_er_t), PIL_1, 0);

	if (ue_queue == NULL || ce_queue == NULL || errh_queue == NULL)
		panic("failed to create required system error queue");

	/*
	 * Setup interrupt handler for power-off button.
	 */
	err_shutdown_inum = add_softintr(PIL_9,
	    (softintrfunc)err_shutdown_softintr, NULL, SOFTINT_ST);

	/*
	 * Initialize the busfunc list mutex. This must be a PIL_15 spin lock
	 * because we will need to acquire it from cpu_async_error().
	 */
	mutex_init(&bfd_lock, NULL, MUTEX_SPIN, (void *)PIL_15);

	/* Only allow one cpu at a time to dump errh errors. */
	mutex_init(&errh_print_lock, NULL, MUTEX_SPIN, (void *)PIL_15);

	/*
	 * Report the PROM "reset-reason" property, if present, so a boot
	 * following a fatal reset is visible on the console.
	 */
	node = prom_rootnode();
	if ((node == OBP_NONODE) || (node == OBP_BADNODE)) {
		cmn_err(CE_CONT, "error_init: node 0x%x\n", (uint_t)node);
		return;
	}

	if (((size = prom_getproplen(node, "reset-reason")) != -1) &&
	    (size <= MAXSYSNAME) &&
	    (prom_getprop(node, "reset-reason", tmp_name) != -1)) {
		if (reset_debug) {
			cmn_err(CE_CONT, "System booting after %s\n", tmp_name);
		} else if (strncmp(tmp_name, "FATAL", 5) == 0) {
			cmn_err(CE_CONT,
			    "System booting after fatal error %s\n", tmp_name);
		}
	}
}
834817Swh94709
835817Swh94709 /*
836817Swh94709 * Nonresumable queue is full, panic here
837817Swh94709 */
/*ARGSUSED*/
void
nrq_overflow(struct regs *rp)
{
	/* A full nonresumable queue is unrecoverable: panic via FMA. */
	fm_panic("Nonresumable queue full");
}
8443156Sgirish
8453156Sgirish /*
8463156Sgirish * This is the place for special error handling for individual errors.
8473156Sgirish */
8483156Sgirish static void
errh_handle_attr(errh_async_flt_t * errh_fltp)8493156Sgirish errh_handle_attr(errh_async_flt_t *errh_fltp)
8503156Sgirish {
8513156Sgirish switch (errh_fltp->errh_er.attr & ~ERRH_MODE_MASK) {
8523156Sgirish case ERRH_ATTR_CPU:
8533156Sgirish case ERRH_ATTR_MEM:
8543156Sgirish case ERRH_ATTR_PIO:
8553156Sgirish case ERRH_ATTR_IRF:
8563156Sgirish case ERRH_ATTR_FRF:
8573156Sgirish case ERRH_ATTR_SHUT:
8583156Sgirish break;
8593156Sgirish
8603156Sgirish case ERRH_ATTR_ASR:
8613156Sgirish errh_handle_asr(errh_fltp);
8623156Sgirish break;
8633156Sgirish
8643156Sgirish case ERRH_ATTR_ASI:
8653156Sgirish case ERRH_ATTR_PREG:
8663156Sgirish case ERRH_ATTR_RQF:
8673156Sgirish break;
8683156Sgirish
8693156Sgirish default:
8703156Sgirish break;
8713156Sgirish }
8723156Sgirish }
8733156Sgirish
8743156Sgirish /*
8753156Sgirish * Handle ASR bit set in ATTR
8763156Sgirish */
8773156Sgirish static void
errh_handle_asr(errh_async_flt_t * errh_fltp)8783156Sgirish errh_handle_asr(errh_async_flt_t *errh_fltp)
8793156Sgirish {
8803156Sgirish uint64_t current_tick;
8813156Sgirish
8823156Sgirish switch (errh_fltp->errh_er.reg) {
8833156Sgirish case ASR_REG_VALID | ASR_REG_TICK:
8843156Sgirish /*
8853156Sgirish * For Tick Compare Register error, it only happens when
8863156Sgirish * the register is being read or compared with the %tick
8873156Sgirish * register. Since we lost the contents of the register,
8883156Sgirish * we set the %tick_compr in the future. An interrupt will
8893156Sgirish * happen when %tick matches the value field of %tick_compr.
8903156Sgirish */
8913156Sgirish current_tick = (uint64_t)gettick();
8923156Sgirish tickcmpr_set(current_tick);
8933156Sgirish /* Do not panic */
8943156Sgirish errh_fltp->cmn_asyncflt.flt_panic = 0;
8953156Sgirish break;
8963156Sgirish
8973156Sgirish default:
8983156Sgirish break;
8993156Sgirish }
9003156Sgirish }
9014612Srf157361
9024612Srf157361 /*
90311304SJanie.Lu@Sun.COM * Handle a SP state change.
90411304SJanie.Lu@Sun.COM */
90511304SJanie.Lu@Sun.COM static void
errh_handle_sp(errh_er_t * errh_erp)90611354SAnthony.Yznaga@Sun.COM errh_handle_sp(errh_er_t *errh_erp)
90711304SJanie.Lu@Sun.COM {
90811304SJanie.Lu@Sun.COM uint8_t sp_state;
90911304SJanie.Lu@Sun.COM
91011354SAnthony.Yznaga@Sun.COM sp_state = (errh_erp->attr & ERRH_SP_MASK) >> ERRH_SP_SHIFT;
91111304SJanie.Lu@Sun.COM
912*11759SAnthony.Yznaga@Sun.COM sp_ereport_post(sp_state);
91311304SJanie.Lu@Sun.COM }
91411304SJanie.Lu@Sun.COM
91511304SJanie.Lu@Sun.COM /*
9164612Srf157361 * Dump the error packet
9174612Srf157361 */
/*ARGSUSED*/
static void
errh_er_print(errh_er_t *errh_erp, const char *queue)
{
	/* View of the packet as 64-bit words split into four 16-bit halves. */
	typedef union {
		uint64_t w;
		uint16_t s[4];
	} errhp_t;
	errhp_t *p = (errhp_t *)errh_erp;
	int i;

	/* Serialize with other CPUs so dumps don't interleave. */
	mutex_enter(&errh_print_lock);
	switch (errh_erp->desc) {
	case ERRH_DESC_UCOR_RE:
		cmn_err(CE_CONT, "\nResumable Uncorrectable Error ");
		break;
	case ERRH_DESC_PR_NRE:
		cmn_err(CE_CONT, "\nNonresumable Precise Error ");
		break;
	case ERRH_DESC_DEF_NRE:
		cmn_err(CE_CONT, "\nNonresumable Deferred Error ");
		break;
	default:
		cmn_err(CE_CONT, "\nError packet ");
		break;
	}
	cmn_err(CE_CONT, "received on %s\n", queue);

	/*
	 * Print Q_ENTRY_SIZE bytes of epacket with 8 bytes per line
	 */
	for (i = Q_ENTRY_SIZE; i > 0; i -= 8, ++p) {
		cmn_err(CE_CONT, "%016lx: %04x %04x %04x %04x\n", (uint64_t)p,
		    p->s[0], p->s[1], p->s[2], p->s[3]);
	}
	mutex_exit(&errh_print_lock);
}
95511304SJanie.Lu@Sun.COM
/*
 * Post an FMA ereport describing a service-processor state change.
 * Only the ERRH_SP_FAULTED state produces an ereport; NOT_PRESENT and
 * AVAILABLE are consumed without posting (see per-case comments), and
 * unknown states are logged and dropped.
 */
static void
sp_ereport_post(uint8_t sp_state)
{
	nvlist_t *ereport, *detector;
	char *str = NULL;

	switch (sp_state) {
	case ERRH_SP_FAULTED:
		str = "chassis.sp.unavailable";
		break;

	case ERRH_SP_NOT_PRESENT:
		/*
		 * It is expected that removal of the SP will be undertaken
		 * in response to an existing service action. Diagnosing
		 * a fault in response to notification that the SP is
		 * missing is therefore undesired. In the future the fault
		 * management architecture may be updated to support more
		 * appropriate alert events. When that happens this code
		 * should be revisited.
		 */
		return;

	case ERRH_SP_AVAILABLE:
		/*
		 * Hypervisor does not send an epkt for this case
		 * so this should never happen.
		 */
		cmn_err(CE_WARN, "Received unexpected notification "
		    "that the SP is available.");
		return;

	default:
		cmn_err(CE_WARN, "Invalid SP state 0x%x. No ereport posted.\n",
		    sp_state);
		return;
	}

	ereport = fm_nvlist_create(NULL);
	detector = fm_nvlist_create(NULL);

	/*
	 * Create an HC-scheme detector FMRI.
	 */
	fm_fmri_hc_set(detector, FM_HC_SCHEME_VERSION, NULL, NULL, 1,
	    "chassis", 0);

	/* str is non-NULL here: only the FAULTED case reaches this point. */
	fm_ereport_set(ereport, FM_EREPORT_VERSION, str,
	    fm_ena_generate(0, FM_ENA_FMT1), detector, NULL);

	(void) fm_ereport_post(ereport, EVCH_TRYHARD);

	fm_nvlist_destroy(ereport, FM_NVA_FREE);
	fm_nvlist_destroy(detector, FM_NVA_FREE);
}
1011