xref: /onnv-gate/usr/src/uts/sun4v/os/error.c (revision 8574:34b33d2c8168)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
52181Sayznaga  * Common Development and Distribution License (the "License").
62181Sayznaga  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
22*8574SJason.Beloro@Sun.COM  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate #include <sys/types.h>
270Sstevel@tonic-gate #include <sys/machsystm.h>
283199Sep32863 #include <sys/sysmacros.h>
290Sstevel@tonic-gate #include <sys/cpuvar.h>
300Sstevel@tonic-gate #include <sys/async.h>
310Sstevel@tonic-gate #include <sys/ontrap.h>
320Sstevel@tonic-gate #include <sys/ddifm.h>
330Sstevel@tonic-gate #include <sys/hypervisor_api.h>
340Sstevel@tonic-gate #include <sys/errorq.h>
350Sstevel@tonic-gate #include <sys/promif.h>
360Sstevel@tonic-gate #include <sys/prom_plat.h>
370Sstevel@tonic-gate #include <sys/x_call.h>
380Sstevel@tonic-gate #include <sys/error.h>
390Sstevel@tonic-gate #include <sys/fm/util.h>
40541Srf157361 #include <sys/ivintr.h>
417718SJason.Beloro@Sun.COM #include <sys/machasi.h>
427718SJason.Beloro@Sun.COM #include <sys/mmu.h>
433156Sgirish #include <sys/archsystm.h>
440Sstevel@tonic-gate 
/*
 * NOTE(review): neither MAX_CE_FLTS nor MAX_ASYNC_FLTS is referenced in the
 * visible part of this file -- confirm whether they are still needed.
 */
450Sstevel@tonic-gate #define	MAX_CE_FLTS		10
460Sstevel@tonic-gate #define	MAX_ASYNC_FLTS		6
470Sstevel@tonic-gate 
/*
 * Error queues used by cpu_queue_one_event(): faults with flt_panic set go
 * to ue_queue, all others go to ce_queue.
 */
480Sstevel@tonic-gate errorq_t *ue_queue;			/* queue of uncorrectable errors */
490Sstevel@tonic-gate errorq_t *ce_queue;			/* queue of correctable errors */
500Sstevel@tonic-gate 
510Sstevel@tonic-gate /*
 * Verbosity controls for correctable errors (CEs); per the original note
 * these are used by the memory test driver.
 *   ce_verbose_memory - covers CEs in DIMMs
 *   ce_verbose_other  - covers "others" (ecache, IO, etc.)
 *
 * If the value is 0, nothing is logged.
 * If the value is 1, the error is logged to the log file, but not console.
 * If the value is 2, the error is logged to the log file and console.
 */
600Sstevel@tonic-gate int	ce_verbose_memory = 1;
610Sstevel@tonic-gate int	ce_verbose_other = 1;
620Sstevel@tonic-gate 
/*
 * Debug switches, all off by default.
 * NOTE(review): none of these is read in the visible part of this file.
 */
630Sstevel@tonic-gate int	ce_show_data = 0;
640Sstevel@tonic-gate int	ce_debug = 0;
650Sstevel@tonic-gate int	ue_debug = 0;
660Sstevel@tonic-gate int	reset_debug = 0;
670Sstevel@tonic-gate 
680Sstevel@tonic-gate /*
 * Tunables for controlling the handling of asynchronous faults (AFTs).
 * Setting these to non-default values on a non-DEBUG kernel is NOT
 * supported.  In this file aft_panic is OR-ed into flt_panic for user-mode
 * or copy-protected faults, and aft_testfatal forces flt_panic for every
 * non-resumable packet.
 */
720Sstevel@tonic-gate int	aft_verbose = 0;	/* log AFT messages > 1 to log only */
730Sstevel@tonic-gate int	aft_panic = 0;		/* panic (not reboot) on fatal usermode AFLT */
740Sstevel@tonic-gate int	aft_testfatal = 0;	/* force all AFTs to panic immediately */
750Sstevel@tonic-gate 
760Sstevel@tonic-gate /*
 * Used for vbsc hostshutdown (power-off button).  The resumable error
 * handler posts the soft interrupt err_shutdown_inum the first time it
 * sees a power-off warning packet and then bumps err_shutdown_triggered
 * so the request is not repeated.
 */
79541Srf157361 int	err_shutdown_triggered = 0;	/* only once */
802973Sgovinda uint64_t err_shutdown_inum = 0;	/* used to pull the trigger */
81541Srf157361 
82541Srf157361 /*
 * Used to print NRE/RE via system variable or kmdb.  When printerrh is
 * non-zero both queue handlers call errh_er_print() on every packet.
 */
854612Srf157361 int		printerrh = 0;		/* see /etc/system */
864612Srf157361 static void	errh_er_print(errh_er_t *, const char *);
874612Srf157361 kmutex_t	errh_print_lock;
884612Srf157361 
894612Srf157361 /*
 * Defined in bus_func.c but initialised in error_init
 */
920Sstevel@tonic-gate extern kmutex_t bfd_lock;
930Sstevel@tonic-gate 
/* NOTE(review): not updated anywhere in the visible part of this file. */
940Sstevel@tonic-gate static uint32_t rq_overflow_count = 0;		/* counter for rq overflow */
950Sstevel@tonic-gate 
/* Forward declarations of file-local helpers. */
960Sstevel@tonic-gate static void cpu_queue_one_event(errh_async_flt_t *);
970Sstevel@tonic-gate static uint32_t count_entries_on_queue(uint64_t, uint64_t, uint32_t);
98917Selowe static void errh_page_retire(errh_async_flt_t *, uchar_t);
990Sstevel@tonic-gate static int errh_error_protected(struct regs *, struct async_flt *, int *);
1000Sstevel@tonic-gate static void errh_rq_full(struct async_flt *);
1010Sstevel@tonic-gate static void ue_drain(void *, struct async_flt *, errorq_elem_t *);
1020Sstevel@tonic-gate static void ce_drain(void *, struct async_flt *, errorq_elem_t *);
1033156Sgirish static void errh_handle_attr(errh_async_flt_t *);
1043156Sgirish static void errh_handle_asr(errh_async_flt_t *);
1050Sstevel@tonic-gate 
1060Sstevel@tonic-gate /*ARGSUSED*/
1070Sstevel@tonic-gate void
1080Sstevel@tonic-gate process_resumable_error(struct regs *rp, uint32_t head_offset,
1090Sstevel@tonic-gate     uint32_t tail_offset)
1100Sstevel@tonic-gate {
1110Sstevel@tonic-gate 	struct machcpu *mcpup;
1120Sstevel@tonic-gate 	struct async_flt *aflt;
1130Sstevel@tonic-gate 	errh_async_flt_t errh_flt;
1140Sstevel@tonic-gate 	errh_er_t *head_va;
1150Sstevel@tonic-gate 
1160Sstevel@tonic-gate 	mcpup = &(CPU->cpu_m);
1170Sstevel@tonic-gate 
1180Sstevel@tonic-gate 	while (head_offset != tail_offset) {
1190Sstevel@tonic-gate 		/* kernel buffer starts right after the resumable queue */
1200Sstevel@tonic-gate 		head_va = (errh_er_t *)(mcpup->cpu_rq_va + head_offset +
1210Sstevel@tonic-gate 		    CPU_RQ_SIZE);
1220Sstevel@tonic-gate 		/* Copy the error report to local buffer */
1230Sstevel@tonic-gate 		bzero(&errh_flt, sizeof (errh_async_flt_t));
1240Sstevel@tonic-gate 		bcopy((char *)head_va, &(errh_flt.errh_er),
1250Sstevel@tonic-gate 		    sizeof (errh_er_t));
1260Sstevel@tonic-gate 
1274612Srf157361 		mcpup->cpu_rq_lastre = head_va;
1284612Srf157361 		if (printerrh)
1294612Srf157361 			errh_er_print(&errh_flt.errh_er, "RQ");
1304612Srf157361 
1310Sstevel@tonic-gate 		/* Increment the queue head */
1320Sstevel@tonic-gate 		head_offset += Q_ENTRY_SIZE;
1330Sstevel@tonic-gate 		/* Wrap around */
1340Sstevel@tonic-gate 		head_offset &= (CPU_RQ_SIZE - 1);
1350Sstevel@tonic-gate 
1360Sstevel@tonic-gate 		/* set error handle to zero so it can hold new error report */
1370Sstevel@tonic-gate 		head_va->ehdl = 0;
1380Sstevel@tonic-gate 
1390Sstevel@tonic-gate 		switch (errh_flt.errh_er.desc) {
1400Sstevel@tonic-gate 		case ERRH_DESC_UCOR_RE:
1413156Sgirish 			/*
1423156Sgirish 			 * Check error attribute, handle individual error
1433156Sgirish 			 * if it is needed.
1443156Sgirish 			 */
1453156Sgirish 			errh_handle_attr(&errh_flt);
1460Sstevel@tonic-gate 			break;
1470Sstevel@tonic-gate 
148541Srf157361 		case ERRH_DESC_WARN_RE:
149541Srf157361 			/*
150541Srf157361 			 * Power-off requested, but handle it one time only.
151541Srf157361 			 */
152541Srf157361 			if (!err_shutdown_triggered) {
153541Srf157361 				setsoftint(err_shutdown_inum);
154541Srf157361 				++err_shutdown_triggered;
155541Srf157361 			}
156541Srf157361 			continue;
157541Srf157361 
1580Sstevel@tonic-gate 		default:
1590Sstevel@tonic-gate 			cmn_err(CE_WARN, "Error Descriptor 0x%llx "
1600Sstevel@tonic-gate 			    " invalid in resumable error handler",
1610Sstevel@tonic-gate 			    (long long) errh_flt.errh_er.desc);
1620Sstevel@tonic-gate 			continue;
1630Sstevel@tonic-gate 		}
1640Sstevel@tonic-gate 
1650Sstevel@tonic-gate 		aflt = (struct async_flt *)&(errh_flt.cmn_asyncflt);
1660Sstevel@tonic-gate 		aflt->flt_id = gethrtime();
1670Sstevel@tonic-gate 		aflt->flt_bus_id = getprocessorid();
1680Sstevel@tonic-gate 		aflt->flt_class = CPU_FAULT;
1690Sstevel@tonic-gate 		aflt->flt_prot = AFLT_PROT_NONE;
1700Sstevel@tonic-gate 		aflt->flt_priv = (((errh_flt.errh_er.attr & ERRH_MODE_MASK)
1710Sstevel@tonic-gate 		    >> ERRH_MODE_SHIFT) == ERRH_MODE_PRIV);
1720Sstevel@tonic-gate 
1730Sstevel@tonic-gate 		if (errh_flt.errh_er.attr & ERRH_ATTR_CPU)
1740Sstevel@tonic-gate 			/* If it is an error on other cpu */
1750Sstevel@tonic-gate 			aflt->flt_panic = 1;
1760Sstevel@tonic-gate 		else
1770Sstevel@tonic-gate 			aflt->flt_panic = 0;
1780Sstevel@tonic-gate 
1790Sstevel@tonic-gate 		/*
1800Sstevel@tonic-gate 		 * Handle resumable queue full case.
1810Sstevel@tonic-gate 		 */
1820Sstevel@tonic-gate 		if (errh_flt.errh_er.attr & ERRH_ATTR_RQF) {
1830Sstevel@tonic-gate 			(void) errh_rq_full(aflt);
1840Sstevel@tonic-gate 		}
1850Sstevel@tonic-gate 
1860Sstevel@tonic-gate 		/*
1870Sstevel@tonic-gate 		 * Queue the error on ce or ue queue depend on flt_panic.
1880Sstevel@tonic-gate 		 * Even if flt_panic is set, the code still keep processing
1890Sstevel@tonic-gate 		 * the rest element on rq until the panic starts.
1900Sstevel@tonic-gate 		 */
1910Sstevel@tonic-gate 		(void) cpu_queue_one_event(&errh_flt);
1920Sstevel@tonic-gate 
1930Sstevel@tonic-gate 		/*
1940Sstevel@tonic-gate 		 * Panic here if aflt->flt_panic has been set.
1950Sstevel@tonic-gate 		 * Enqueued errors will be logged as part of the panic flow.
1960Sstevel@tonic-gate 		 */
1970Sstevel@tonic-gate 		if (aflt->flt_panic) {
1980Sstevel@tonic-gate 			fm_panic("Unrecoverable error on another CPU");
1990Sstevel@tonic-gate 		}
2000Sstevel@tonic-gate 	}
2010Sstevel@tonic-gate }
2020Sstevel@tonic-gate 
2030Sstevel@tonic-gate void
2041457Swh94709 process_nonresumable_error(struct regs *rp, uint64_t flags,
2050Sstevel@tonic-gate     uint32_t head_offset, uint32_t tail_offset)
2060Sstevel@tonic-gate {
2070Sstevel@tonic-gate 	struct machcpu *mcpup;
2080Sstevel@tonic-gate 	struct async_flt *aflt;
2090Sstevel@tonic-gate 	errh_async_flt_t errh_flt;
2100Sstevel@tonic-gate 	errh_er_t *head_va;
2110Sstevel@tonic-gate 	int trampolined = 0;
2120Sstevel@tonic-gate 	int expected = DDI_FM_ERR_UNEXPECTED;
2130Sstevel@tonic-gate 	uint64_t exec_mode;
2141457Swh94709 	uint8_t u_spill_fill;
2157718SJason.Beloro@Sun.COM 	int u_kill = 1;
2160Sstevel@tonic-gate 
2170Sstevel@tonic-gate 	mcpup = &(CPU->cpu_m);
2180Sstevel@tonic-gate 
2190Sstevel@tonic-gate 	while (head_offset != tail_offset) {
2200Sstevel@tonic-gate 		/* kernel buffer starts right after the nonresumable queue */
2210Sstevel@tonic-gate 		head_va = (errh_er_t *)(mcpup->cpu_nrq_va + head_offset +
2220Sstevel@tonic-gate 		    CPU_NRQ_SIZE);
2230Sstevel@tonic-gate 
2240Sstevel@tonic-gate 		/* Copy the error report to local buffer */
2250Sstevel@tonic-gate 		bzero(&errh_flt, sizeof (errh_async_flt_t));
2260Sstevel@tonic-gate 
2270Sstevel@tonic-gate 		bcopy((char *)head_va, &(errh_flt.errh_er),
2280Sstevel@tonic-gate 		    sizeof (errh_er_t));
2290Sstevel@tonic-gate 
2304612Srf157361 		mcpup->cpu_nrq_lastnre = head_va;
2314612Srf157361 		if (printerrh)
2324612Srf157361 			errh_er_print(&errh_flt.errh_er, "NRQ");
2334612Srf157361 
2340Sstevel@tonic-gate 		/* Increment the queue head */
2350Sstevel@tonic-gate 		head_offset += Q_ENTRY_SIZE;
2360Sstevel@tonic-gate 		/* Wrap around */
2370Sstevel@tonic-gate 		head_offset &= (CPU_NRQ_SIZE - 1);
2380Sstevel@tonic-gate 
2390Sstevel@tonic-gate 		/* set error handle to zero so it can hold new error report */
2400Sstevel@tonic-gate 		head_va->ehdl = 0;
2410Sstevel@tonic-gate 
2420Sstevel@tonic-gate 		aflt = (struct async_flt *)&(errh_flt.cmn_asyncflt);
2430Sstevel@tonic-gate 
2440Sstevel@tonic-gate 		trampolined = 0;
2450Sstevel@tonic-gate 
2460Sstevel@tonic-gate 		if (errh_flt.errh_er.attr & ERRH_ATTR_PIO)
2470Sstevel@tonic-gate 			aflt->flt_class = BUS_FAULT;
2480Sstevel@tonic-gate 		else
2490Sstevel@tonic-gate 			aflt->flt_class = CPU_FAULT;
2500Sstevel@tonic-gate 
2510Sstevel@tonic-gate 		aflt->flt_id = gethrtime();
2520Sstevel@tonic-gate 		aflt->flt_bus_id = getprocessorid();
2530Sstevel@tonic-gate 		aflt->flt_pc = (caddr_t)rp->r_pc;
2540Sstevel@tonic-gate 		exec_mode = (errh_flt.errh_er.attr & ERRH_MODE_MASK)
2550Sstevel@tonic-gate 		    >> ERRH_MODE_SHIFT;
2560Sstevel@tonic-gate 		aflt->flt_priv = (exec_mode == ERRH_MODE_PRIV ||
2570Sstevel@tonic-gate 		    exec_mode == ERRH_MODE_UNKNOWN);
2580Sstevel@tonic-gate 		aflt->flt_prot = AFLT_PROT_NONE;
2591457Swh94709 		aflt->flt_tl = (uchar_t)(flags & ERRH_TL_MASK);
2600Sstevel@tonic-gate 		aflt->flt_panic = ((aflt->flt_tl != 0) ||
2610Sstevel@tonic-gate 		    (aft_testfatal != 0));
2620Sstevel@tonic-gate 
2631457Swh94709 		/*
2641457Swh94709 		 * For the first error packet on the queue, check if it
2651457Swh94709 		 * happened in user fill/spill trap.
2661457Swh94709 		 */
2671457Swh94709 		if (flags & ERRH_U_SPILL_FILL) {
2681457Swh94709 			u_spill_fill = 1;
2691457Swh94709 			/* clear the user fill/spill flag in flags */
2701457Swh94709 			flags = (uint64_t)aflt->flt_tl;
2711457Swh94709 		} else
2721457Swh94709 			u_spill_fill = 0;
2731457Swh94709 
2740Sstevel@tonic-gate 		switch (errh_flt.errh_er.desc) {
2750Sstevel@tonic-gate 		case ERRH_DESC_PR_NRE:
2761457Swh94709 			if (u_spill_fill) {
2771457Swh94709 				aflt->flt_panic = 0;
2781457Swh94709 				break;
2791457Swh94709 			}
2800Sstevel@tonic-gate 			/*
2817718SJason.Beloro@Sun.COM 			 * Context Register Parity - for reload of secondary
282*8574SJason.Beloro@Sun.COM 			 * context register, see nonresumable_error.
2837718SJason.Beloro@Sun.COM 			 */
2847718SJason.Beloro@Sun.COM 			if ((errh_flt.errh_er.attr & ERRH_ATTR_ASI) &&
285*8574SJason.Beloro@Sun.COM 			    (errh_flt.errh_er.asi == ASI_MMU_CTX)) {
2867718SJason.Beloro@Sun.COM 
2877718SJason.Beloro@Sun.COM 				if (aflt->flt_tl)	/* TL>0, so panic */
2887718SJason.Beloro@Sun.COM 					break;
2897718SJason.Beloro@Sun.COM 
290*8574SJason.Beloro@Sun.COM 				/* Panic on unknown context registers */
291*8574SJason.Beloro@Sun.COM 				if (errh_flt.errh_er.addr < MMU_PCONTEXT0 ||
292*8574SJason.Beloro@Sun.COM 				    errh_flt.errh_er.addr + errh_flt.errh_er.sz
293*8574SJason.Beloro@Sun.COM 				    > MMU_SCONTEXT1 + sizeof (uint64_t)) {
294*8574SJason.Beloro@Sun.COM 					cmn_err(CE_WARN, "Parity error on "
295*8574SJason.Beloro@Sun.COM 					    "unknown context register\n");
296*8574SJason.Beloro@Sun.COM 					aflt->flt_panic = 1;
297*8574SJason.Beloro@Sun.COM 					break;
298*8574SJason.Beloro@Sun.COM 				}
299*8574SJason.Beloro@Sun.COM 
3007718SJason.Beloro@Sun.COM 				u_kill = 0;		/* do not terminate */
3017718SJason.Beloro@Sun.COM 				break;
3027718SJason.Beloro@Sun.COM 			}
3037718SJason.Beloro@Sun.COM 			/*
3047718SJason.Beloro@Sun.COM 			 * All other PR_NRE fall through in order to
3057718SJason.Beloro@Sun.COM 			 * check for protection.  The list can include
3067718SJason.Beloro@Sun.COM 			 * ERRH_ATTR_FRF, ERRH_ATTR_IRF, ERRH_ATTR_MEM,
3077718SJason.Beloro@Sun.COM 			 * and ERRH_ATTR_PIO.
3080Sstevel@tonic-gate 			 */
3091457Swh94709 			/*FALLTHRU*/
3100Sstevel@tonic-gate 
3110Sstevel@tonic-gate 		case ERRH_DESC_DEF_NRE:
3120Sstevel@tonic-gate 			/*
3130Sstevel@tonic-gate 			 * If the trap occurred in privileged mode at TL=0,
3140Sstevel@tonic-gate 			 * we need to check to see if we were executing
3150Sstevel@tonic-gate 			 * in kernel under on_trap() or t_lofault
3161280Srf157361 			 * protection. If so, and if it was a PIO or MEM
3171280Srf157361 			 * error, then modify the saved registers so that
3181280Srf157361 			 * we return from the trap to the appropriate
3191280Srf157361 			 * trampoline routine.
3200Sstevel@tonic-gate 			 */
3211280Srf157361 			if (aflt->flt_priv == 1 && aflt->flt_tl == 0 &&
3221280Srf157361 			    ((errh_flt.errh_er.attr & ERRH_ATTR_PIO) ||
3231280Srf157361 			    (errh_flt.errh_er.attr & ERRH_ATTR_MEM))) {
3240Sstevel@tonic-gate 				trampolined =
3250Sstevel@tonic-gate 				    errh_error_protected(rp, aflt, &expected);
3261280Srf157361 			}
3270Sstevel@tonic-gate 
3280Sstevel@tonic-gate 			if (!aflt->flt_priv || aflt->flt_prot ==
3290Sstevel@tonic-gate 			    AFLT_PROT_COPY) {
3300Sstevel@tonic-gate 				aflt->flt_panic |= aft_panic;
3310Sstevel@tonic-gate 			} else if (!trampolined &&
3322477Srf157361 			    (aflt->flt_class != BUS_FAULT)) {
3330Sstevel@tonic-gate 				aflt->flt_panic = 1;
3340Sstevel@tonic-gate 			}
3350Sstevel@tonic-gate 
3360Sstevel@tonic-gate 			/*
3373156Sgirish 			 * Check error attribute, handle individual error
3383156Sgirish 			 * if it is needed.
3393156Sgirish 			 */
3403156Sgirish 			errh_handle_attr(&errh_flt);
3413156Sgirish 
3423156Sgirish 			/*
3430Sstevel@tonic-gate 			 * If PIO error, we need to query the bus nexus
3440Sstevel@tonic-gate 			 * for fatal errors.
3450Sstevel@tonic-gate 			 */
3460Sstevel@tonic-gate 			if (aflt->flt_class == BUS_FAULT) {
3477718SJason.Beloro@Sun.COM 				aflt->flt_addr = errh_flt.errh_er.addr;
3480Sstevel@tonic-gate 				errh_cpu_run_bus_error_handlers(aflt,
3490Sstevel@tonic-gate 				    expected);
3500Sstevel@tonic-gate 			}
3510Sstevel@tonic-gate 
3520Sstevel@tonic-gate 			break;
3530Sstevel@tonic-gate 
3543313Siskreen 		case ERRH_DESC_USER_DCORE:
3553313Siskreen 			/*
3563313Siskreen 			 * User generated panic. Call panic directly
3573313Siskreen 			 * since there are no FMA e-reports to
3583313Siskreen 			 * display.
3593313Siskreen 			 */
3603313Siskreen 
3613313Siskreen 			panic("Panic - Generated at user request");
3623313Siskreen 
3633313Siskreen 			break;
3643313Siskreen 
3650Sstevel@tonic-gate 		default:
3662218Swh94709 			cmn_err(CE_WARN, "Panic - Error Descriptor 0x%llx "
3672218Swh94709 			    " invalid in non-resumable error handler",
3680Sstevel@tonic-gate 			    (long long) errh_flt.errh_er.desc);
3692218Swh94709 			aflt->flt_panic = 1;
3702218Swh94709 			break;
3710Sstevel@tonic-gate 		}
3720Sstevel@tonic-gate 
3730Sstevel@tonic-gate 		/*
3740Sstevel@tonic-gate 		 * Queue the error report for further processing. If
3750Sstevel@tonic-gate 		 * flt_panic is set, code still process other errors
3760Sstevel@tonic-gate 		 * in the queue until the panic routine stops the
3770Sstevel@tonic-gate 		 * kernel.
3780Sstevel@tonic-gate 		 */
3790Sstevel@tonic-gate 		(void) cpu_queue_one_event(&errh_flt);
3800Sstevel@tonic-gate 
3810Sstevel@tonic-gate 		/*
3820Sstevel@tonic-gate 		 * Panic here if aflt->flt_panic has been set.
3830Sstevel@tonic-gate 		 * Enqueued errors will be logged as part of the panic flow.
3840Sstevel@tonic-gate 		 */
3850Sstevel@tonic-gate 		if (aflt->flt_panic) {
3860Sstevel@tonic-gate 			fm_panic("Unrecoverable hardware error");
3870Sstevel@tonic-gate 		}
3880Sstevel@tonic-gate 
3890Sstevel@tonic-gate 		/*
390917Selowe 		 * Call page_retire() to handle memory errors.
3910Sstevel@tonic-gate 		 */
3920Sstevel@tonic-gate 		if (errh_flt.errh_er.attr & ERRH_ATTR_MEM)
393917Selowe 			errh_page_retire(&errh_flt, PR_UE);
3940Sstevel@tonic-gate 
3950Sstevel@tonic-gate 		/*
3967718SJason.Beloro@Sun.COM 		 * If we queued an error for a thread that should terminate
3977718SJason.Beloro@Sun.COM 		 * and it was in user mode or protected by t_lofault, set AST
3987718SJason.Beloro@Sun.COM 		 * flag so the queue will be drained before returning to user
3997718SJason.Beloro@Sun.COM 		 * mode.  Note that user threads can be killed via pcb_flags.
4000Sstevel@tonic-gate 		 */
4017718SJason.Beloro@Sun.COM 		if (u_kill && (!aflt->flt_priv ||
4027718SJason.Beloro@Sun.COM 		    aflt->flt_prot == AFLT_PROT_COPY || u_spill_fill)) {
4030Sstevel@tonic-gate 			int pcb_flag = 0;
4040Sstevel@tonic-gate 
4050Sstevel@tonic-gate 			if (aflt->flt_class == CPU_FAULT)
4060Sstevel@tonic-gate 				pcb_flag |= ASYNC_HWERR;
4070Sstevel@tonic-gate 			else if (aflt->flt_class == BUS_FAULT)
4080Sstevel@tonic-gate 				pcb_flag |= ASYNC_BERR;
4090Sstevel@tonic-gate 
4100Sstevel@tonic-gate 			ttolwp(curthread)->lwp_pcb.pcb_flags |= pcb_flag;
4110Sstevel@tonic-gate 			aston(curthread);
4120Sstevel@tonic-gate 		}
4130Sstevel@tonic-gate 	}
4140Sstevel@tonic-gate }
4150Sstevel@tonic-gate 
4160Sstevel@tonic-gate /*
4170Sstevel@tonic-gate  * For PIO errors, this routine calls nexus driver's error
4180Sstevel@tonic-gate  * callback routines. If the callback routine returns fatal, and
4190Sstevel@tonic-gate  * we are in kernel or unknow mode without any error protection,
4200Sstevel@tonic-gate  * we need to turn on the panic flag.
4210Sstevel@tonic-gate  */
4220Sstevel@tonic-gate void
4230Sstevel@tonic-gate errh_cpu_run_bus_error_handlers(struct async_flt *aflt, int expected)
4240Sstevel@tonic-gate {
4250Sstevel@tonic-gate 	int status;
4260Sstevel@tonic-gate 	ddi_fm_error_t de;
4270Sstevel@tonic-gate 
4280Sstevel@tonic-gate 	bzero(&de, sizeof (ddi_fm_error_t));
4290Sstevel@tonic-gate 
4300Sstevel@tonic-gate 	de.fme_version = DDI_FME_VERSION;
4310Sstevel@tonic-gate 	de.fme_ena = fm_ena_generate(aflt->flt_id, FM_ENA_FMT1);
4320Sstevel@tonic-gate 	de.fme_flag = expected;
4330Sstevel@tonic-gate 	de.fme_bus_specific = (void *)aflt->flt_addr;
4340Sstevel@tonic-gate 	status = ndi_fm_handler_dispatch(ddi_root_node(), NULL, &de);
4350Sstevel@tonic-gate 
4360Sstevel@tonic-gate 	/*
4370Sstevel@tonic-gate 	 * If error is protected, it will jump to proper routine
4380Sstevel@tonic-gate 	 * to handle the handle; if it is in user level, we just
4390Sstevel@tonic-gate 	 * kill the user process; if the driver thinks the error is
4400Sstevel@tonic-gate 	 * not fatal, we can drive on. If none of above are true,
4410Sstevel@tonic-gate 	 * we panic
4420Sstevel@tonic-gate 	 */
4430Sstevel@tonic-gate 	if ((aflt->flt_prot == AFLT_PROT_NONE) && (aflt->flt_priv == 1) &&
4440Sstevel@tonic-gate 	    (status == DDI_FM_FATAL))
4450Sstevel@tonic-gate 		aflt->flt_panic = 1;
4460Sstevel@tonic-gate }
4470Sstevel@tonic-gate 
4480Sstevel@tonic-gate /*
4490Sstevel@tonic-gate  * This routine checks to see if we are under any error protection when
4500Sstevel@tonic-gate  * the error happens. If we are under error protection, we unwind to
4510Sstevel@tonic-gate  * the protection and indicate fault.
4520Sstevel@tonic-gate  */
4530Sstevel@tonic-gate static int
4540Sstevel@tonic-gate errh_error_protected(struct regs *rp, struct async_flt *aflt, int *expected)
4550Sstevel@tonic-gate {
4560Sstevel@tonic-gate 	int trampolined = 0;
4570Sstevel@tonic-gate 	ddi_acc_hdl_t *hp;
4580Sstevel@tonic-gate 
4590Sstevel@tonic-gate 	if (curthread->t_ontrap != NULL) {
4600Sstevel@tonic-gate 		on_trap_data_t *otp = curthread->t_ontrap;
4610Sstevel@tonic-gate 
4620Sstevel@tonic-gate 		if (otp->ot_prot & OT_DATA_EC) {
4630Sstevel@tonic-gate 			aflt->flt_prot = AFLT_PROT_EC;
4640Sstevel@tonic-gate 			otp->ot_trap |= OT_DATA_EC;
4650Sstevel@tonic-gate 			rp->r_pc = otp->ot_trampoline;
4660Sstevel@tonic-gate 			rp->r_npc = rp->r_pc +4;
4670Sstevel@tonic-gate 			trampolined = 1;
4680Sstevel@tonic-gate 		}
4690Sstevel@tonic-gate 
4700Sstevel@tonic-gate 		if (otp->ot_prot & OT_DATA_ACCESS) {
4710Sstevel@tonic-gate 			aflt->flt_prot = AFLT_PROT_ACCESS;
4720Sstevel@tonic-gate 			otp->ot_trap |= OT_DATA_ACCESS;
4730Sstevel@tonic-gate 			rp->r_pc = otp->ot_trampoline;
4740Sstevel@tonic-gate 			rp->r_npc = rp->r_pc + 4;
4750Sstevel@tonic-gate 			trampolined = 1;
4760Sstevel@tonic-gate 			/*
4770Sstevel@tonic-gate 			 * for peek and caut_gets
4780Sstevel@tonic-gate 			 * errors are expected
4790Sstevel@tonic-gate 			 */
4800Sstevel@tonic-gate 			hp = (ddi_acc_hdl_t *)otp->ot_handle;
4810Sstevel@tonic-gate 			if (!hp)
4820Sstevel@tonic-gate 				*expected = DDI_FM_ERR_PEEK;
4830Sstevel@tonic-gate 			else if (hp->ah_acc.devacc_attr_access ==
4840Sstevel@tonic-gate 			    DDI_CAUTIOUS_ACC)
4850Sstevel@tonic-gate 				*expected = DDI_FM_ERR_EXPECTED;
4860Sstevel@tonic-gate 		}
4870Sstevel@tonic-gate 	} else if (curthread->t_lofault) {
4880Sstevel@tonic-gate 		aflt->flt_prot = AFLT_PROT_COPY;
4890Sstevel@tonic-gate 		rp->r_g1 = EFAULT;
4900Sstevel@tonic-gate 		rp->r_pc = curthread->t_lofault;
4910Sstevel@tonic-gate 		rp->r_npc = rp->r_pc + 4;
4920Sstevel@tonic-gate 		trampolined = 1;
4930Sstevel@tonic-gate 	}
4940Sstevel@tonic-gate 
4950Sstevel@tonic-gate 	return (trampolined);
4960Sstevel@tonic-gate }
4970Sstevel@tonic-gate 
4980Sstevel@tonic-gate /*
4990Sstevel@tonic-gate  * Queue one event.
5000Sstevel@tonic-gate  */
5010Sstevel@tonic-gate static void
5020Sstevel@tonic-gate cpu_queue_one_event(errh_async_flt_t *errh_fltp)
5030Sstevel@tonic-gate {
5040Sstevel@tonic-gate 	struct async_flt *aflt = (struct async_flt *)errh_fltp;
5050Sstevel@tonic-gate 	errorq_t *eqp;
5060Sstevel@tonic-gate 
5070Sstevel@tonic-gate 	if (aflt->flt_panic)
5080Sstevel@tonic-gate 		eqp = ue_queue;
5090Sstevel@tonic-gate 	else
5100Sstevel@tonic-gate 		eqp = ce_queue;
5110Sstevel@tonic-gate 
5120Sstevel@tonic-gate 	errorq_dispatch(eqp, errh_fltp, sizeof (errh_async_flt_t),
5130Sstevel@tonic-gate 	    aflt->flt_panic);
5140Sstevel@tonic-gate }
5150Sstevel@tonic-gate 
5160Sstevel@tonic-gate /*
5170Sstevel@tonic-gate  * The cpu_async_log_err() function is called by the ce/ue_drain() function to
5180Sstevel@tonic-gate  * handle logging for CPU events that are dequeued.  As such, it can be invoked
5190Sstevel@tonic-gate  * from softint context, from AST processing in the trap() flow, or from the
5200Sstevel@tonic-gate  * panic flow.  We decode the CPU-specific data, and log appropriate messages.
5210Sstevel@tonic-gate  */
5220Sstevel@tonic-gate void
5230Sstevel@tonic-gate cpu_async_log_err(void *flt)
5240Sstevel@tonic-gate {
5250Sstevel@tonic-gate 	errh_async_flt_t *errh_fltp = (errh_async_flt_t *)flt;
5260Sstevel@tonic-gate 	errh_er_t *errh_erp = (errh_er_t *)&errh_fltp->errh_er;
5270Sstevel@tonic-gate 
5280Sstevel@tonic-gate 	switch (errh_erp->desc) {
5290Sstevel@tonic-gate 	case ERRH_DESC_UCOR_RE:
5300Sstevel@tonic-gate 		if (errh_erp->attr & ERRH_ATTR_MEM) {
5310Sstevel@tonic-gate 			/*
532917Selowe 			 * Turn on the PR_UE flag. The page will be
5330Sstevel@tonic-gate 			 * scrubbed when it is freed.
5340Sstevel@tonic-gate 			 */
535917Selowe 			errh_page_retire(errh_fltp, PR_UE);
5360Sstevel@tonic-gate 		}
5370Sstevel@tonic-gate 
5380Sstevel@tonic-gate 		break;
5390Sstevel@tonic-gate 
5400Sstevel@tonic-gate 	case ERRH_DESC_PR_NRE:
5410Sstevel@tonic-gate 	case ERRH_DESC_DEF_NRE:
5420Sstevel@tonic-gate 		if (errh_erp->attr & ERRH_ATTR_MEM) {
5430Sstevel@tonic-gate 			/*
5440Sstevel@tonic-gate 			 * For non-resumable memory error, retire
5450Sstevel@tonic-gate 			 * the page here.
5460Sstevel@tonic-gate 			 */
547917Selowe 			errh_page_retire(errh_fltp, PR_UE);
548639Swh94709 
549639Swh94709 			/*
550639Swh94709 			 * If we are going to panic, scrub the page first
551639Swh94709 			 */
552639Swh94709 			if (errh_fltp->cmn_asyncflt.flt_panic)
5537718SJason.Beloro@Sun.COM 				mem_scrub(errh_fltp->errh_er.addr,
554639Swh94709 				    errh_fltp->errh_er.sz);
5550Sstevel@tonic-gate 		}
5560Sstevel@tonic-gate 		break;
5570Sstevel@tonic-gate 
5580Sstevel@tonic-gate 	default:
5590Sstevel@tonic-gate 		break;
5600Sstevel@tonic-gate 	}
5610Sstevel@tonic-gate }
5620Sstevel@tonic-gate 
5630Sstevel@tonic-gate /*
5640Sstevel@tonic-gate  * Called from ce_drain().
5650Sstevel@tonic-gate  */
5660Sstevel@tonic-gate void
5670Sstevel@tonic-gate cpu_ce_log_err(struct async_flt *aflt)
5680Sstevel@tonic-gate {
5690Sstevel@tonic-gate 	switch (aflt->flt_class) {
5700Sstevel@tonic-gate 	case CPU_FAULT:
5710Sstevel@tonic-gate 		cpu_async_log_err(aflt);
5720Sstevel@tonic-gate 		break;
5730Sstevel@tonic-gate 
5740Sstevel@tonic-gate 	case BUS_FAULT:
5750Sstevel@tonic-gate 		cpu_async_log_err(aflt);
5760Sstevel@tonic-gate 		break;
5770Sstevel@tonic-gate 
5780Sstevel@tonic-gate 	default:
5790Sstevel@tonic-gate 		break;
5800Sstevel@tonic-gate 	}
5810Sstevel@tonic-gate }
5820Sstevel@tonic-gate 
5830Sstevel@tonic-gate /*
5840Sstevel@tonic-gate  * Called from ue_drain().
5850Sstevel@tonic-gate  */
5860Sstevel@tonic-gate void
5870Sstevel@tonic-gate cpu_ue_log_err(struct async_flt *aflt)
5880Sstevel@tonic-gate {
5890Sstevel@tonic-gate 	switch (aflt->flt_class) {
5900Sstevel@tonic-gate 	case CPU_FAULT:
5910Sstevel@tonic-gate 		cpu_async_log_err(aflt);
5920Sstevel@tonic-gate 		break;
5930Sstevel@tonic-gate 
5940Sstevel@tonic-gate 	case BUS_FAULT:
5950Sstevel@tonic-gate 		cpu_async_log_err(aflt);
5960Sstevel@tonic-gate 		break;
5970Sstevel@tonic-gate 
5980Sstevel@tonic-gate 	default:
5990Sstevel@tonic-gate 		break;
6000Sstevel@tonic-gate 	}
6010Sstevel@tonic-gate }
6020Sstevel@tonic-gate 
6030Sstevel@tonic-gate /*
6040Sstevel@tonic-gate  * Turn on flag on the error memory region.
6050Sstevel@tonic-gate  */
6060Sstevel@tonic-gate static void
607917Selowe errh_page_retire(errh_async_flt_t *errh_fltp, uchar_t flag)
6080Sstevel@tonic-gate {
6097718SJason.Beloro@Sun.COM 	uint64_t flt_real_addr_start = errh_fltp->errh_er.addr;
6100Sstevel@tonic-gate 	uint64_t flt_real_addr_end = flt_real_addr_start +
6110Sstevel@tonic-gate 	    errh_fltp->errh_er.sz - 1;
6120Sstevel@tonic-gate 	int64_t current_addr;
6130Sstevel@tonic-gate 
6140Sstevel@tonic-gate 	if (errh_fltp->errh_er.sz == 0)
6150Sstevel@tonic-gate 		return;
6160Sstevel@tonic-gate 
6170Sstevel@tonic-gate 	for (current_addr = flt_real_addr_start;
6180Sstevel@tonic-gate 	    current_addr < flt_real_addr_end; current_addr += MMU_PAGESIZE) {
619917Selowe 		(void) page_retire(current_addr, flag);
6200Sstevel@tonic-gate 	}
6210Sstevel@tonic-gate }
6220Sstevel@tonic-gate 
6230Sstevel@tonic-gate void
6240Sstevel@tonic-gate mem_scrub(uint64_t paddr, uint64_t len)
6250Sstevel@tonic-gate {
6260Sstevel@tonic-gate 	uint64_t pa, length, scrubbed_len;
6270Sstevel@tonic-gate 
6280Sstevel@tonic-gate 	pa = paddr;
6290Sstevel@tonic-gate 	length = len;
6300Sstevel@tonic-gate 	scrubbed_len = 0;
6310Sstevel@tonic-gate 
632639Swh94709 	while (length > 0) {
633639Swh94709 		if (hv_mem_scrub(pa, length, &scrubbed_len) != H_EOK)
6340Sstevel@tonic-gate 			break;
6350Sstevel@tonic-gate 
6360Sstevel@tonic-gate 		pa += scrubbed_len;
6370Sstevel@tonic-gate 		length -= scrubbed_len;
6380Sstevel@tonic-gate 	}
6390Sstevel@tonic-gate }
6400Sstevel@tonic-gate 
6411457Swh94709 /*
6423199Sep32863  * Call hypervisor to flush the memory region.
6433199Sep32863  * Both va and len must be MMU_PAGESIZE aligned.
6443199Sep32863  * Returns the total number of bytes flushed.
6451457Swh94709  */
6463199Sep32863 uint64_t
6473729Swh94709 mem_sync(caddr_t orig_va, size_t orig_len)
6480Sstevel@tonic-gate {
6490Sstevel@tonic-gate 	uint64_t pa, length, flushed;
6503199Sep32863 	uint64_t chunk_len = MMU_PAGESIZE;
6513199Sep32863 	uint64_t total_flushed = 0;
6523729Swh94709 	uint64_t va, len;
6530Sstevel@tonic-gate 
6543729Swh94709 	if (orig_len == 0)
6553199Sep32863 		return (total_flushed);
6560Sstevel@tonic-gate 
6573729Swh94709 	/* align va */
6583729Swh94709 	va = P2ALIGN_TYPED(orig_va, MMU_PAGESIZE, uint64_t);
6593729Swh94709 	/* round up len to MMU_PAGESIZE aligned */
6603729Swh94709 	len = P2ROUNDUP_TYPED(orig_va + orig_len, MMU_PAGESIZE, uint64_t) - va;
6613729Swh94709 
6623199Sep32863 	while (len > 0) {
6633199Sep32863 		pa = va_to_pa((caddr_t)va);
6643199Sep32863 		if (pa == (uint64_t)-1)
6653199Sep32863 			return (total_flushed);
6660Sstevel@tonic-gate 
6673199Sep32863 		length = chunk_len;
6683199Sep32863 		flushed = 0;
6691457Swh94709 
6703199Sep32863 		while (length > 0) {
6713199Sep32863 			if (hv_mem_sync(pa, length, &flushed) != H_EOK)
6723199Sep32863 				return (total_flushed);
6730Sstevel@tonic-gate 
6743199Sep32863 			pa += flushed;
6753199Sep32863 			length -= flushed;
6763199Sep32863 			total_flushed += flushed;
6773199Sep32863 		}
6780Sstevel@tonic-gate 
6793199Sep32863 		va += chunk_len;
6803199Sep32863 		len -= chunk_len;
6810Sstevel@tonic-gate 	}
6823199Sep32863 
6833199Sep32863 	return (total_flushed);
6840Sstevel@tonic-gate }
6850Sstevel@tonic-gate 
6860Sstevel@tonic-gate /*
6870Sstevel@tonic-gate  * If resumable queue is full, we need to check if any cpu is in
6880Sstevel@tonic-gate  * error state. If not, we drive on. If yes, we need to panic. The
6890Sstevel@tonic-gate  * hypervisor call hv_cpu_state() is being used for checking the
6903750Srf157361  * cpu state.  And reset %tick_compr in case tick-compare was lost.
6910Sstevel@tonic-gate  */
6920Sstevel@tonic-gate static void
6930Sstevel@tonic-gate errh_rq_full(struct async_flt *afltp)
6940Sstevel@tonic-gate {
6950Sstevel@tonic-gate 	processorid_t who;
6960Sstevel@tonic-gate 	uint64_t cpu_state;
6970Sstevel@tonic-gate 	uint64_t retval;
6983750Srf157361 	uint64_t current_tick;
6993750Srf157361 
7003750Srf157361 	current_tick = (uint64_t)gettick();
7013750Srf157361 	tickcmpr_set(current_tick);
7020Sstevel@tonic-gate 
7030Sstevel@tonic-gate 	for (who = 0; who < NCPU; who++)
7040Sstevel@tonic-gate 		if (CPU_IN_SET(cpu_ready_set, who)) {
7050Sstevel@tonic-gate 			retval = hv_cpu_state(who, &cpu_state);
7060Sstevel@tonic-gate 			if (retval != H_EOK || cpu_state == CPU_STATE_ERROR) {
7070Sstevel@tonic-gate 				afltp->flt_panic = 1;
7080Sstevel@tonic-gate 				break;
7090Sstevel@tonic-gate 			}
7100Sstevel@tonic-gate 		}
7110Sstevel@tonic-gate }
7120Sstevel@tonic-gate 
7130Sstevel@tonic-gate /*
7140Sstevel@tonic-gate  * Return processor specific async error structure
7150Sstevel@tonic-gate  * size used.
7160Sstevel@tonic-gate  */
7170Sstevel@tonic-gate int
7180Sstevel@tonic-gate cpu_aflt_size(void)
7190Sstevel@tonic-gate {
7200Sstevel@tonic-gate 	return (sizeof (errh_async_flt_t));
7210Sstevel@tonic-gate }
7220Sstevel@tonic-gate 
7230Sstevel@tonic-gate #define	SZ_TO_ETRS_SHIFT	6
7240Sstevel@tonic-gate 
7250Sstevel@tonic-gate /*
7260Sstevel@tonic-gate  * Message print out when resumable queue is overflown
7270Sstevel@tonic-gate  */
7280Sstevel@tonic-gate /*ARGSUSED*/
7290Sstevel@tonic-gate void
7300Sstevel@tonic-gate rq_overflow(struct regs *rp, uint64_t head_offset,
7310Sstevel@tonic-gate     uint64_t tail_offset)
7320Sstevel@tonic-gate {
7330Sstevel@tonic-gate 	rq_overflow_count++;
7340Sstevel@tonic-gate }
7350Sstevel@tonic-gate 
7360Sstevel@tonic-gate /*
7370Sstevel@tonic-gate  * Handler to process a fatal error.  This routine can be called from a
7380Sstevel@tonic-gate  * softint, called from trap()'s AST handling, or called from the panic flow.
7390Sstevel@tonic-gate  */
7400Sstevel@tonic-gate /*ARGSUSED*/
7410Sstevel@tonic-gate static void
7420Sstevel@tonic-gate ue_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep)
7430Sstevel@tonic-gate {
7440Sstevel@tonic-gate 	cpu_ue_log_err(aflt);
7450Sstevel@tonic-gate }
7460Sstevel@tonic-gate 
7470Sstevel@tonic-gate /*
7480Sstevel@tonic-gate  * Handler to process a correctable error.  This routine can be called from a
7490Sstevel@tonic-gate  * softint.  We just call the CPU module's logging routine.
7500Sstevel@tonic-gate  */
7510Sstevel@tonic-gate /*ARGSUSED*/
7520Sstevel@tonic-gate static void
7530Sstevel@tonic-gate ce_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep)
7540Sstevel@tonic-gate {
7550Sstevel@tonic-gate 	cpu_ce_log_err(aflt);
7560Sstevel@tonic-gate }
7570Sstevel@tonic-gate 
7580Sstevel@tonic-gate /*
759541Srf157361  * Handler to process vbsc hostshutdown (power-off button).
760541Srf157361  */
761541Srf157361 static int
762541Srf157361 err_shutdown_softintr()
763541Srf157361 {
764541Srf157361 	cmn_err(CE_WARN, "Power-off requested, system will now shutdown.");
765541Srf157361 	do_shutdown();
766541Srf157361 
767541Srf157361 	/*
768541Srf157361 	 * just in case do_shutdown() fails
769541Srf157361 	 */
770541Srf157361 	(void) timeout((void(*)(void *))power_down, NULL, 100 * hz);
771541Srf157361 	return (DDI_INTR_CLAIMED);
772541Srf157361 }
773541Srf157361 
774541Srf157361 /*
7750Sstevel@tonic-gate  * Allocate error queue sizes based on max_ncpus.  max_ncpus is set just
7760Sstevel@tonic-gate  * after ncpunode has been determined.  ncpus is set in start_other_cpus
7770Sstevel@tonic-gate  * which is called after error_init() but may change dynamically.
7780Sstevel@tonic-gate  */
7790Sstevel@tonic-gate void
7800Sstevel@tonic-gate error_init(void)
7810Sstevel@tonic-gate {
7820Sstevel@tonic-gate 	char tmp_name[MAXSYSNAME];
783789Sahrens 	pnode_t node;
7840Sstevel@tonic-gate 	size_t size = cpu_aflt_size();
7850Sstevel@tonic-gate 
7860Sstevel@tonic-gate 	/*
7870Sstevel@tonic-gate 	 * Initialize the correctable and uncorrectable error queues.
7880Sstevel@tonic-gate 	 */
7890Sstevel@tonic-gate 	ue_queue = errorq_create("ue_queue", (errorq_func_t)ue_drain, NULL,
7900Sstevel@tonic-gate 	    MAX_ASYNC_FLTS * (max_ncpus + 1), size, PIL_2, ERRORQ_VITAL);
7910Sstevel@tonic-gate 
7920Sstevel@tonic-gate 	ce_queue = errorq_create("ce_queue", (errorq_func_t)ce_drain, NULL,
7930Sstevel@tonic-gate 	    MAX_CE_FLTS * (max_ncpus + 1), size, PIL_1, 0);
7940Sstevel@tonic-gate 
7950Sstevel@tonic-gate 	if (ue_queue == NULL || ce_queue == NULL)
7960Sstevel@tonic-gate 		panic("failed to create required system error queue");
7970Sstevel@tonic-gate 
7980Sstevel@tonic-gate 	/*
799541Srf157361 	 * Setup interrupt handler for power-off button.
800541Srf157361 	 */
801541Srf157361 	err_shutdown_inum = add_softintr(PIL_9,
8022973Sgovinda 	    (softintrfunc)err_shutdown_softintr, NULL, SOFTINT_ST);
803541Srf157361 
804541Srf157361 	/*
8050Sstevel@tonic-gate 	 * Initialize the busfunc list mutex.  This must be a PIL_15 spin lock
8060Sstevel@tonic-gate 	 * because we will need to acquire it from cpu_async_error().
8070Sstevel@tonic-gate 	 */
8080Sstevel@tonic-gate 	mutex_init(&bfd_lock, NULL, MUTEX_SPIN, (void *)PIL_15);
8090Sstevel@tonic-gate 
8104612Srf157361 	/* Only allow one cpu at a time to dump errh errors. */
8114612Srf157361 	mutex_init(&errh_print_lock, NULL, MUTEX_SPIN, (void *)PIL_15);
8124612Srf157361 
8130Sstevel@tonic-gate 	node = prom_rootnode();
8140Sstevel@tonic-gate 	if ((node == OBP_NONODE) || (node == OBP_BADNODE)) {
8150Sstevel@tonic-gate 		cmn_err(CE_CONT, "error_init: node 0x%x\n", (uint_t)node);
8160Sstevel@tonic-gate 		return;
8170Sstevel@tonic-gate 	}
8180Sstevel@tonic-gate 
8190Sstevel@tonic-gate 	if (((size = prom_getproplen(node, "reset-reason")) != -1) &&
8200Sstevel@tonic-gate 	    (size <= MAXSYSNAME) &&
8210Sstevel@tonic-gate 	    (prom_getprop(node, "reset-reason", tmp_name) != -1)) {
8220Sstevel@tonic-gate 		if (reset_debug) {
8230Sstevel@tonic-gate 			cmn_err(CE_CONT, "System booting after %s\n", tmp_name);
8240Sstevel@tonic-gate 		} else if (strncmp(tmp_name, "FATAL", 5) == 0) {
8250Sstevel@tonic-gate 			cmn_err(CE_CONT,
8260Sstevel@tonic-gate 			    "System booting after fatal error %s\n", tmp_name);
8270Sstevel@tonic-gate 		}
8280Sstevel@tonic-gate 	}
8290Sstevel@tonic-gate }
830817Swh94709 
/*
 * Called when the nonresumable queue is full.  There is no recovery from
 * this; panic through fm_panic().  The register argument is not used.
 */
/*ARGSUSED*/
void
nrq_overflow(struct regs *rp)
{
	fm_panic("Nonresumable queue full");
}
8403156Sgirish 
8413156Sgirish /*
8423156Sgirish  * This is the place for special error handling for individual errors.
8433156Sgirish  */
8443156Sgirish static void
8453156Sgirish errh_handle_attr(errh_async_flt_t *errh_fltp)
8463156Sgirish {
8473156Sgirish 	switch (errh_fltp->errh_er.attr & ~ERRH_MODE_MASK) {
8483156Sgirish 	case ERRH_ATTR_CPU:
8493156Sgirish 	case ERRH_ATTR_MEM:
8503156Sgirish 	case ERRH_ATTR_PIO:
8513156Sgirish 	case ERRH_ATTR_IRF:
8523156Sgirish 	case ERRH_ATTR_FRF:
8533156Sgirish 	case ERRH_ATTR_SHUT:
8543156Sgirish 		break;
8553156Sgirish 
8563156Sgirish 	case ERRH_ATTR_ASR:
8573156Sgirish 		errh_handle_asr(errh_fltp);
8583156Sgirish 		break;
8593156Sgirish 
8603156Sgirish 	case ERRH_ATTR_ASI:
8613156Sgirish 	case ERRH_ATTR_PREG:
8623156Sgirish 	case ERRH_ATTR_RQF:
8633156Sgirish 		break;
8643156Sgirish 
8653156Sgirish 	default:
8663156Sgirish 		break;
8673156Sgirish 	}
8683156Sgirish }
8693156Sgirish 
8703156Sgirish /*
8713156Sgirish  * Handle ASR bit set in ATTR
8723156Sgirish  */
8733156Sgirish static void
8743156Sgirish errh_handle_asr(errh_async_flt_t *errh_fltp)
8753156Sgirish {
8763156Sgirish 	uint64_t current_tick;
8773156Sgirish 
8783156Sgirish 	switch (errh_fltp->errh_er.reg) {
8793156Sgirish 	case ASR_REG_VALID | ASR_REG_TICK:
8803156Sgirish 		/*
8813156Sgirish 		 * For Tick Compare Register error, it only happens when
8823156Sgirish 		 * the register is being read or compared with the %tick
8833156Sgirish 		 * register. Since we lost the contents of the register,
8843156Sgirish 		 * we set the %tick_compr in the future. An interrupt will
8853156Sgirish 		 * happen when %tick matches the value field of %tick_compr.
8863156Sgirish 		 */
8873156Sgirish 		current_tick = (uint64_t)gettick();
8883156Sgirish 		tickcmpr_set(current_tick);
8893156Sgirish 		/* Do not panic */
8903156Sgirish 		errh_fltp->cmn_asyncflt.flt_panic = 0;
8913156Sgirish 		break;
8923156Sgirish 
8933156Sgirish 	default:
8943156Sgirish 		break;
8953156Sgirish 	}
8963156Sgirish }
8974612Srf157361 
8984612Srf157361 /*
8994612Srf157361  * Dump the error packet
9004612Srf157361  */
9014612Srf157361 /*ARGSUSED*/
9024612Srf157361 static void
9034612Srf157361 errh_er_print(errh_er_t *errh_erp, const char *queue)
9044612Srf157361 {
9054612Srf157361 	typedef union {
9064612Srf157361 		uint64_t w;
9074612Srf157361 		uint16_t s[4];
9084612Srf157361 	} errhp_t;
9094612Srf157361 	errhp_t *p = (errhp_t *)errh_erp;
9104612Srf157361 	int i;
9114612Srf157361 
9124612Srf157361 	mutex_enter(&errh_print_lock);
9134612Srf157361 	switch (errh_erp->desc) {
9144612Srf157361 	case ERRH_DESC_UCOR_RE:
9154612Srf157361 		cmn_err(CE_CONT, "\nResumable Uncorrectable Error ");
9164612Srf157361 		break;
9174612Srf157361 	case ERRH_DESC_PR_NRE:
9184612Srf157361 		cmn_err(CE_CONT, "\nNonresumable Precise Error ");
9194612Srf157361 		break;
9204612Srf157361 	case ERRH_DESC_DEF_NRE:
9214612Srf157361 		cmn_err(CE_CONT, "\nNonresumable Deferred Error ");
9224612Srf157361 		break;
9234612Srf157361 	default:
9244612Srf157361 		cmn_err(CE_CONT, "\nError packet ");
9254612Srf157361 		break;
9264612Srf157361 	}
9274612Srf157361 	cmn_err(CE_CONT, "received on %s\n", queue);
9284612Srf157361 
9294612Srf157361 	/*
9304612Srf157361 	 * Print Q_ENTRY_SIZE bytes of epacket with 8 bytes per line
9314612Srf157361 	 */
9324612Srf157361 	for (i = Q_ENTRY_SIZE; i > 0; i -= 8, ++p) {
9334612Srf157361 		cmn_err(CE_CONT, "%016lx: %04x %04x %04x %04x\n", (uint64_t)p,
9344612Srf157361 		    p->s[0], p->s[1], p->s[2], p->s[3]);
9354612Srf157361 	}
9364612Srf157361 	mutex_exit(&errh_print_lock);
9374612Srf157361 }
938