/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/machsystm.h>
#include <sys/cpuvar.h>
#include <sys/async.h>
#include <sys/ontrap.h>
#include <sys/ddifm.h>
#include <sys/hypervisor_api.h>
#include <sys/errorq.h>
#include <sys/promif.h>
#include <sys/prom_plat.h>
#include <sys/x_call.h>
#include <sys/error.h>
#include <sys/fm/util.h>

#define	MAX_CE_FLTS	10
#define	MAX_ASYNC_FLTS	6

errorq_t *ue_queue;		/* queue of uncorrectable errors */
errorq_t *ce_queue;		/* queue of correctable errors */

/*
 * Used by the memory test driver.
 * ce_verbose_memory - covers CEs in DIMMs
 * ce_verbose_other - covers "others" (ecache, IO, etc.)
 *
 * If the value is 0, nothing is logged.
 * If the value is 1, the error is logged to the log file, but not the console.
 * If the value is 2, the error is logged to both the log file and the console.
 */
int	ce_verbose_memory = 1;
int	ce_verbose_other = 1;

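/*
 * As with other kernel globals, these tunables can be set from
 * /etc/system, e.g. (illustrative):
 *
 *	set ce_verbose_memory = 2
 */
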
int	ce_show_data = 0;
int	ce_debug = 0;
int	ue_debug = 0;
int	reset_debug = 0;

/*
 * Tunables for controlling the handling of asynchronous faults (AFTs).
 * Setting these to non-default values on a non-DEBUG kernel is NOT supported.
 */
int	aft_verbose = 0;	/* log AFT messages > 1 to log only */
int	aft_panic = 0;		/* panic (not reboot) on fatal usermode AFLT */
int	aft_testfatal = 0;	/* force all AFTs to panic immediately */

/*
 * Defined in bus_func.c but initialised in error_init
 */
extern kmutex_t bfd_lock;

static uint32_t rq_overflow_count = 0;	/* counter for rq overflow */

static void cpu_queue_one_event(errh_async_flt_t *);
static uint32_t count_entries_on_queue(uint64_t, uint64_t, uint32_t);
static void errh_page_settoxic(errh_async_flt_t *, uchar_t);
static void errh_page_retire(errh_async_flt_t *);
static int errh_error_protected(struct regs *, struct async_flt *, int *);
static void errh_rq_full(struct async_flt *);
static void ue_drain(void *, struct async_flt *, errorq_elem_t *);
static void ce_drain(void *, struct async_flt *, errorq_elem_t *);

/*ARGSUSED*/
void
process_resumable_error(struct regs *rp, uint32_t head_offset,
    uint32_t tail_offset)
{
	struct machcpu *mcpup;
	struct async_flt *aflt;
	errh_async_flt_t errh_flt;
	errh_er_t *head_va;

	mcpup = &(CPU->cpu_m);

	while (head_offset != tail_offset) {
		/* kernel buffer starts right after the resumable queue */
		head_va = (errh_er_t *)(mcpup->cpu_rq_va + head_offset +
		    CPU_RQ_SIZE);
		/* Copy the error report to a local buffer */
		bzero(&errh_flt, sizeof (errh_async_flt_t));
		bcopy((char *)head_va, &(errh_flt.errh_er),
		    sizeof (errh_er_t));

		/* Increment the queue head */
		head_offset += Q_ENTRY_SIZE;
		/* Wrap around */
		head_offset &= (CPU_RQ_SIZE - 1);
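
		/*
		 * Note: the mask above assumes the queue size is a
		 * power of two; e.g. with a 16KB queue (illustrative
		 * value only), offset 16384 wraps back to 0.
		 */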

		/* set error handle to zero so it can hold a new error report */
		head_va->ehdl = 0;

		switch (errh_flt.errh_er.desc) {
		case ERRH_DESC_UCOR_RE:
			break;

		default:
			cmn_err(CE_WARN, "Error Descriptor 0x%llx "
			    " invalid in resumable error handler",
			    (long long) errh_flt.errh_er.desc);
			continue;
		}

		aflt = (struct async_flt *)&(errh_flt.cmn_asyncflt);
		aflt->flt_id = gethrtime();
		aflt->flt_bus_id = getprocessorid();
		aflt->flt_class = CPU_FAULT;
		aflt->flt_prot = AFLT_PROT_NONE;
		aflt->flt_priv = (((errh_flt.errh_er.attr & ERRH_MODE_MASK)
		    >> ERRH_MODE_SHIFT) == ERRH_MODE_PRIV);

		if (errh_flt.errh_er.attr & ERRH_ATTR_CPU)
			/* If it is an error on another cpu */
			aflt->flt_panic = 1;
		else
			aflt->flt_panic = 0;

		/*
		 * Handle the resumable-queue-full case.
		 */
		if (errh_flt.errh_er.attr & ERRH_ATTR_RQF) {
			(void) errh_rq_full(aflt);
		}

		/*
		 * Queue the error on the ce or ue queue depending on
		 * flt_panic.  Even if flt_panic is set, we keep processing
		 * the remaining elements on the rq until the panic starts.
		 */
		(void) cpu_queue_one_event(&errh_flt);

		/*
		 * Panic here if aflt->flt_panic has been set.
		 * Enqueued errors will be logged as part of the panic flow.
		 */
		if (aflt->flt_panic) {
			fm_panic("Unrecoverable error on another CPU");
		}
	}
}

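/*
 * Process all entries on the nonresumable error queue.  Unlike the
 * resumable case above, these errors (precise and deferred traps)
 * cannot simply be logged and resumed; each report is checked against
 * on_trap()/t_lofault protection to decide whether we can trampoline
 * to a recovery routine or must panic.
 */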
void
process_nonresumable_error(struct regs *rp, uint64_t tl,
    uint32_t head_offset, uint32_t tail_offset)
{
	struct machcpu *mcpup;
	struct async_flt *aflt;
	errh_async_flt_t errh_flt;
	errh_er_t *head_va;
	int trampolined = 0;
	int expected = DDI_FM_ERR_UNEXPECTED;
	uint64_t exec_mode;

	mcpup = &(CPU->cpu_m);

	while (head_offset != tail_offset) {
		/* kernel buffer starts right after the nonresumable queue */
		head_va = (errh_er_t *)(mcpup->cpu_nrq_va + head_offset +
		    CPU_NRQ_SIZE);

		/* Copy the error report to a local buffer */
		bzero(&errh_flt, sizeof (errh_async_flt_t));

		bcopy((char *)head_va, &(errh_flt.errh_er),
		    sizeof (errh_er_t));

		/* Increment the queue head */
		head_offset += Q_ENTRY_SIZE;
		/* Wrap around */
		head_offset &= (CPU_NRQ_SIZE - 1);

		/* set error handle to zero so it can hold a new error report */
		head_va->ehdl = 0;

		aflt = (struct async_flt *)&(errh_flt.cmn_asyncflt);

		trampolined = 0;

		if (errh_flt.errh_er.attr & ERRH_ATTR_PIO)
			aflt->flt_class = BUS_FAULT;
		else
			aflt->flt_class = CPU_FAULT;

		aflt->flt_id = gethrtime();
		aflt->flt_bus_id = getprocessorid();
		aflt->flt_pc = (caddr_t)rp->r_pc;
		exec_mode = (errh_flt.errh_er.attr & ERRH_MODE_MASK)
		    >> ERRH_MODE_SHIFT;
		aflt->flt_priv = (exec_mode == ERRH_MODE_PRIV ||
		    exec_mode == ERRH_MODE_UNKNOWN);
		aflt->flt_tl = (uchar_t)tl;
		aflt->flt_prot = AFLT_PROT_NONE;
		aflt->flt_panic = ((aflt->flt_tl != 0) ||
		    (aft_testfatal != 0));

		switch (errh_flt.errh_er.desc) {
		case ERRH_DESC_PR_NRE:
			/*
			 * Fall through: a precise fault also needs to be
			 * checked to see if it was protected.
			 */

		case ERRH_DESC_DEF_NRE:
			/*
			 * If the trap occurred in privileged mode at TL=0,
			 * we need to check to see if we were executing
			 * in the kernel under on_trap() or t_lofault
			 * protection.  If so, modify the saved registers
			 * so that we return from the trap to the
			 * appropriate trampoline routine.
			 */
			if (aflt->flt_priv == 1 && aflt->flt_tl == 0)
				trampolined =
				    errh_error_protected(rp, aflt, &expected);

			if (!aflt->flt_priv || aflt->flt_prot ==
			    AFLT_PROT_COPY) {
				aflt->flt_panic |= aft_panic;
			} else if (!trampolined &&
			    aflt->flt_class != BUS_FAULT) {
				aflt->flt_panic = 1;
			}

			/*
			 * If it is a PIO error, we need to query the bus
			 * nexus for fatal errors.
			 */
			if (aflt->flt_class == BUS_FAULT) {
				aflt->flt_addr = errh_flt.errh_er.ra;
				errh_cpu_run_bus_error_handlers(aflt,
				    expected);
			}

			break;

		default:
			cmn_err(CE_WARN, "Error Descriptor 0x%llx "
			    " invalid in nonresumable error handler",
			    (long long) errh_flt.errh_er.desc);
			continue;
		}

		/*
		 * Queue the error report for further processing.  If
		 * flt_panic is set, we still process the other errors
		 * in the queue until the panic routine stops the kernel.
		 */
		(void) cpu_queue_one_event(&errh_flt);

		/*
		 * Panic here if aflt->flt_panic has been set.
		 * Enqueued errors will be logged as part of the panic flow.
		 */
		if (aflt->flt_panic) {
			fm_panic("Unrecoverable hardware error");
		}

		/*
		 * If it is a memory error, we turn on the PAGE_IS_TOXIC
		 * flag.  The page will be retired later and scrubbed when
		 * it is freed.
		 */
		if (errh_flt.errh_er.attr & ERRH_ATTR_MEM)
			(void) errh_page_settoxic(&errh_flt, PAGE_IS_TOXIC);

		/*
		 * If we queued an error and it was in user mode or
		 * protected by t_lofault, set the AST flag so the queue
		 * will be drained before returning to user mode.
		 */
		if (!aflt->flt_priv || aflt->flt_prot == AFLT_PROT_COPY) {
			int pcb_flag = 0;

			if (aflt->flt_class == CPU_FAULT)
				pcb_flag |= ASYNC_HWERR;
			else if (aflt->flt_class == BUS_FAULT)
				pcb_flag |= ASYNC_BERR;

			ttolwp(curthread)->lwp_pcb.pcb_flags |= pcb_flag;
			aston(curthread);
		}
	}
}

/*
 * For PIO errors, this routine calls the nexus driver's error
 * callback routines.  If a callback routine returns fatal, and
 * we are in kernel or unknown mode without any error protection,
 * we need to turn on the panic flag.
 */
void
errh_cpu_run_bus_error_handlers(struct async_flt *aflt, int expected)
{
	int status;
	ddi_fm_error_t de;

	bzero(&de, sizeof (ddi_fm_error_t));

	de.fme_version = DDI_FME_VERSION;
	de.fme_ena = fm_ena_generate(aflt->flt_id, FM_ENA_FMT1);
	de.fme_flag = expected;
	de.fme_bus_specific = (void *)aflt->flt_addr;
	status = ndi_fm_handler_dispatch(ddi_root_node(), NULL, &de);

	/*
	 * If the error is protected, we will jump to the proper routine
	 * to handle it; if it happened at user level, we just kill the
	 * user process; if the driver thinks the error is not fatal,
	 * we can drive on.  If none of the above is true, we panic.
	 */
	if ((aflt->flt_prot == AFLT_PROT_NONE) && (aflt->flt_priv == 1) &&
	    (status == DDI_FM_FATAL))
		aflt->flt_panic = 1;
}

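/*
 * For reference, the callbacks dispatched above are registered by
 * nexus drivers via ddi_fm_handler_register(9F).  A minimal sketch
 * (hypothetical my_nexus names, error analysis elided; the handler
 * returns DDI_FM_OK, DDI_FM_NONFATAL, or DDI_FM_FATAL):
 *
 *	static int
 *	my_nexus_err_cb(dev_info_t *dip, ddi_fm_error_t *derr,
 *	    const void *impl_data)
 *	{
 *		return (DDI_FM_OK);
 *	}
 *
 *	ddi_fm_handler_register(dip, my_nexus_err_cb, NULL);
 */
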
/*
 * This routine checks to see if we are under any error protection when
 * the error happens.  If we are under error protection, we unwind to
 * the protection and indicate the fault.
 */
static int
errh_error_protected(struct regs *rp, struct async_flt *aflt, int *expected)
{
	int trampolined = 0;
	ddi_acc_hdl_t *hp;

	if (curthread->t_ontrap != NULL) {
		on_trap_data_t *otp = curthread->t_ontrap;

		if (otp->ot_prot & OT_DATA_EC) {
			aflt->flt_prot = AFLT_PROT_EC;
			otp->ot_trap |= OT_DATA_EC;
			rp->r_pc = otp->ot_trampoline;
			rp->r_npc = rp->r_pc + 4;
			trampolined = 1;
		}

		if (otp->ot_prot & OT_DATA_ACCESS) {
			aflt->flt_prot = AFLT_PROT_ACCESS;
			otp->ot_trap |= OT_DATA_ACCESS;
			rp->r_pc = otp->ot_trampoline;
			rp->r_npc = rp->r_pc + 4;
			trampolined = 1;
			/*
			 * For peeks and caut_gets,
			 * errors are expected.
			 */
			hp = (ddi_acc_hdl_t *)otp->ot_handle;
			if (!hp)
				*expected = DDI_FM_ERR_PEEK;
			else if (hp->ah_acc.devacc_attr_access ==
			    DDI_CAUTIOUS_ACC)
				*expected = DDI_FM_ERR_EXPECTED;
		}
	} else if (curthread->t_lofault) {
		aflt->flt_prot = AFLT_PROT_COPY;
		rp->r_g1 = EFAULT;
		rp->r_pc = curthread->t_lofault;
		rp->r_npc = rp->r_pc + 4;
		trampolined = 1;
	}

	return (trampolined);
}

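/*
 * For reference, the OT_DATA_* protection unwound above is typically
 * established by kernel code as follows (minimal sketch; on_trap()
 * returns zero on the initial call and non-zero when control returns
 * via the trampoline):
 *
 *	on_trap_data_t otd;
 *
 *	if (!on_trap(&otd, OT_DATA_EC)) {
 *		(access that may take a data error)
 *	}
 *	no_trap();
 */
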
/*
 * Queue one event.
 */
static void
cpu_queue_one_event(errh_async_flt_t *errh_fltp)
{
	struct async_flt *aflt = (struct async_flt *)errh_fltp;
	errorq_t *eqp;

	if (aflt->flt_panic)
		eqp = ue_queue;
	else
		eqp = ce_queue;

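	/*
	 * errorq_dispatch() copies the report into a preallocated
	 * queue element, so the caller's stack copy may be reused
	 * as soon as we return.
	 */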
	errorq_dispatch(eqp, errh_fltp, sizeof (errh_async_flt_t),
	    aflt->flt_panic);
}

/*
 * The cpu_async_log_err() function is called by the ce/ue_drain() function to
 * handle logging for CPU events that are dequeued.  As such, it can be invoked
 * from softint context, from AST processing in the trap() flow, or from the
 * panic flow.  We decode the CPU-specific data, and log appropriate messages.
 */
void
cpu_async_log_err(void *flt)
{
	errh_async_flt_t *errh_fltp = (errh_async_flt_t *)flt;
	errh_er_t *errh_erp = (errh_er_t *)&errh_fltp->errh_er;

	switch (errh_erp->desc) {
	case ERRH_DESC_UCOR_RE:
		if (errh_erp->attr & ERRH_ATTR_MEM) {
			/*
			 * Turn on the PAGE_IS_TOXIC flag.  The page will be
			 * scrubbed when it is freed.
			 */
			(void) errh_page_settoxic(errh_fltp, PAGE_IS_TOXIC);
		}

		break;

	case ERRH_DESC_PR_NRE:
	case ERRH_DESC_DEF_NRE:
		if (errh_erp->attr & ERRH_ATTR_MEM) {
			/*
			 * For a non-resumable memory error, retire
			 * the page here.
			 */
			errh_page_retire(errh_fltp);
		}
		break;

	default:
		break;
	}
}

/*
 * Called from ce_drain().
 */
void
cpu_ce_log_err(struct async_flt *aflt)
{
	switch (aflt->flt_class) {
	case CPU_FAULT:
		cpu_async_log_err(aflt);
		break;

	case BUS_FAULT:
		cpu_async_log_err(aflt);
		break;

	default:
		break;
	}
}

/*
 * Called from ue_drain().
 */
void
cpu_ue_log_err(struct async_flt *aflt)
{
	switch (aflt->flt_class) {
	case CPU_FAULT:
		cpu_async_log_err(aflt);
		break;

	case BUS_FAULT:
		cpu_async_log_err(aflt);
		break;

	default:
		break;
	}
}

/*
 * Turn on the given flag on the page(s) in the error memory region.
 */
static void
errh_page_settoxic(errh_async_flt_t *errh_fltp, uchar_t flag)
{
	page_t *pp;
	uint64_t flt_real_addr_start = errh_fltp->errh_er.ra;
	uint64_t flt_real_addr_end = flt_real_addr_start +
	    errh_fltp->errh_er.sz - 1;
	int64_t current_addr;

	if (errh_fltp->errh_er.sz == 0)
		return;

	for (current_addr = flt_real_addr_start;
	    current_addr < flt_real_addr_end; current_addr += MMU_PAGESIZE) {
		pp = page_numtopp_nolock((pfn_t)
		    (current_addr >> MMU_PAGESHIFT));

		if (pp != NULL) {
			page_settoxic(pp, flag);
		}
	}
}

/*
 * Retire the page(s) indicated in the error report.
 */
static void
errh_page_retire(errh_async_flt_t *errh_fltp)
{
	page_t *pp;
	uint64_t flt_real_addr_start = errh_fltp->errh_er.ra;
	uint64_t flt_real_addr_end = flt_real_addr_start +
	    errh_fltp->errh_er.sz - 1;
	int64_t current_addr;

	if (errh_fltp->errh_er.sz == 0)
		return;

	for (current_addr = flt_real_addr_start;
	    current_addr < flt_real_addr_end; current_addr += MMU_PAGESIZE) {
		pp = page_numtopp_nolock((pfn_t)
		    (current_addr >> MMU_PAGESHIFT));

		if (pp != NULL) {
			(void) page_retire(pp, PAGE_IS_TOXIC);
		}
	}
}

void
mem_scrub(uint64_t paddr, uint64_t len)
{
	uint64_t pa, length, scrubbed_len;
	uint64_t ret = H_EOK;

	pa = paddr;
	length = len;
	scrubbed_len = 0;

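	/*
	 * The hypervisor may scrub less than the requested length
	 * (to bound latency) and reports the amount actually
	 * scrubbed, so the loop advances past the scrubbed portion
	 * before retrying.
	 */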
	while (ret == H_EOK) {
		ret = hv_mem_scrub(pa, length, &scrubbed_len);

		if (ret == H_EOK || scrubbed_len >= length) {
			break;
		}

		pa += scrubbed_len;
		length -= scrubbed_len;
	}
}

void
mem_sync(caddr_t va, size_t len)
{
	uint64_t pa, length, flushed;
	uint64_t ret = H_EOK;

	pa = va_to_pa((caddr_t)va);

	if (pa == (uint64_t)-1)
		return;

	length = len;
	flushed = 0;

	while (ret == H_EOK) {
		ret = hv_mem_sync(pa, length, &flushed);

		if (ret == H_EOK || flushed >= length) {
			break;
		}

		pa += flushed;
		length -= flushed;
	}
}

/*
 * If the resumable queue is full, we need to check if any cpu is in
 * error state.  If not, we drive on.  If yes, we need to panic.  The
 * hypervisor call hv_cpu_state() is used to check the cpu state.
 */
static void
errh_rq_full(struct async_flt *afltp)
{
	processorid_t who;
	uint64_t cpu_state;
	uint64_t retval;

	for (who = 0; who < NCPU; who++)
		if (CPU_IN_SET(cpu_ready_set, who)) {
			retval = hv_cpu_state(who, &cpu_state);
			if (retval != H_EOK || cpu_state == CPU_STATE_ERROR) {
				afltp->flt_panic = 1;
				break;
			}
		}
}

/*
 * Return the size of the processor-specific async error structure
 * used.
 */
int
cpu_aflt_size(void)
{
	return (sizeof (errh_async_flt_t));
}

#define	SZ_TO_ETRS_SHIFT	6

/*
 * Called when the resumable queue overflows; we just count the
 * overflows here.
 */
/*ARGSUSED*/
void
rq_overflow(struct regs *rp, uint64_t head_offset,
    uint64_t tail_offset)
{
	rq_overflow_count++;
}

/*
 * Handler to process a fatal error.  This routine can be called from a
 * softint, called from trap()'s AST handling, or called from the panic flow.
 */
/*ARGSUSED*/
static void
ue_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep)
{
	cpu_ue_log_err(aflt);
}

/*
 * Handler to process a correctable error.  This routine can be called from a
 * softint.  We just call the CPU module's logging routine.
 */
/*ARGSUSED*/
static void
ce_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep)
{
	cpu_ce_log_err(aflt);
}

/*
 * Allocate error queue sizes based on max_ncpus.  max_ncpus is set just
 * after ncpunode has been determined.  ncpus is set in start_other_cpus
 * which is called after error_init() but may change dynamically.
 */
void
error_init(void)
{
	char tmp_name[MAXSYSNAME];
	dnode_t node;
	size_t size = cpu_aflt_size();

	/*
	 * Initialize the correctable and uncorrectable error queues.
	 */
	ue_queue = errorq_create("ue_queue", (errorq_func_t)ue_drain, NULL,
	    MAX_ASYNC_FLTS * (max_ncpus + 1), size, PIL_2, ERRORQ_VITAL);

	ce_queue = errorq_create("ce_queue", (errorq_func_t)ce_drain, NULL,
	    MAX_CE_FLTS * (max_ncpus + 1), size, PIL_1, 0);

	if (ue_queue == NULL || ce_queue == NULL)
		panic("failed to create required system error queue");

	/*
	 * Initialize the busfunc list mutex.  This must be a PIL_15 spin lock
	 * because we will need to acquire it from cpu_async_error().
	 */
	mutex_init(&bfd_lock, NULL, MUTEX_SPIN, (void *)PIL_15);

	node = prom_rootnode();
	if ((node == OBP_NONODE) || (node == OBP_BADNODE)) {
		cmn_err(CE_CONT, "error_init: node 0x%x\n", (uint_t)node);
		return;
	}

	if (((size = prom_getproplen(node, "reset-reason")) != -1) &&
	    (size <= MAXSYSNAME) &&
	    (prom_getprop(node, "reset-reason", tmp_name) != -1)) {
		if (reset_debug) {
			cmn_err(CE_CONT, "System booting after %s\n", tmp_name);
		} else if (strncmp(tmp_name, "FATAL", 5) == 0) {
			cmn_err(CE_CONT,
			    "System booting after fatal error %s\n", tmp_name);
		}
	}
}