/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/machsystm.h>
#include <sys/cpuvar.h>
#include <sys/async.h>
#include <sys/ontrap.h>
#include <sys/ddifm.h>
#include <sys/hypervisor_api.h>
#include <sys/errorq.h>
#include <sys/promif.h>
#include <sys/prom_plat.h>
#include <sys/x_call.h>
#include <sys/error.h>
#include <sys/fm/util.h>

#define	MAX_CE_FLTS		10
#define	MAX_ASYNC_FLTS		6
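/*
 * These constants scale the depth of the CE and UE error queues created
 * in error_init(): each queue holds MAX_*_FLTS * (max_ncpus + 1) entries.
 */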

errorq_t *ue_queue;			/* queue of uncorrectable errors */
errorq_t *ce_queue;			/* queue of correctable errors */

/*
 * Used by the memory test driver.
 * ce_verbose_memory - covers CEs in DIMMs
 * ce_verbose_other - covers "others" (ecache, IO, etc.)
 *
 * If the value is 0, nothing is logged.
 * If the value is 1, the error is logged to the log file, but not the console.
 * If the value is 2, the error is logged to both the log file and the console.
 */
int	ce_verbose_memory = 1;
int	ce_verbose_other = 1;

int	ce_show_data = 0;
int	ce_debug = 0;
int	ue_debug = 0;
int	reset_debug = 0;

/*
 * Tunables for controlling the handling of asynchronous faults (AFTs). Setting
 * these to non-default values on a non-DEBUG kernel is NOT supported.
 */
int	aft_verbose = 0;	/* log AFT messages > 1 to log only */
int	aft_panic = 0;		/* panic (not reboot) on fatal usermode AFLT */
int	aft_testfatal = 0;	/* force all AFTs to panic immediately */

/*
 * Defined in bus_func.c but initialized in error_init()
 */
extern kmutex_t bfd_lock;

static uint32_t rq_overflow_count = 0;		/* counter for rq overflow */

static void cpu_queue_one_event(errh_async_flt_t *);
static uint32_t count_entries_on_queue(uint64_t, uint64_t, uint32_t);
static void errh_page_settoxic(errh_async_flt_t *, uchar_t);
static void errh_page_retire(errh_async_flt_t *);
static int errh_error_protected(struct regs *, struct async_flt *, int *);
static void errh_rq_full(struct async_flt *);
static void ue_drain(void *, struct async_flt *, errorq_elem_t *);
static void ce_drain(void *, struct async_flt *, errorq_elem_t *);

/*ARGSUSED*/
void
process_resumable_error(struct regs *rp, uint32_t head_offset,
    uint32_t tail_offset)
{
	struct machcpu *mcpup;
	struct async_flt *aflt;
	errh_async_flt_t errh_flt;
	errh_er_t *head_va;

	mcpup = &(CPU->cpu_m);

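	/*
	 * head_offset and tail_offset are byte offsets into this CPU's
	 * circular resumable error queue; consume entries until the
	 * queue is empty.
	 */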
	while (head_offset != tail_offset) {
		/* kernel buffer starts right after the resumable queue */
		head_va = (errh_er_t *)(mcpup->cpu_rq_va + head_offset +
		    CPU_RQ_SIZE);
		/* Copy the error report to local buffer */
		bzero(&errh_flt, sizeof (errh_async_flt_t));
		bcopy((char *)head_va, &(errh_flt.errh_er),
		    sizeof (errh_er_t));

		/* Increment the queue head */
		head_offset += Q_ENTRY_SIZE;
		/* Wrap around */
		head_offset &= (CPU_RQ_SIZE - 1);
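		/*
		 * The mask above implements the wrap-around correctly
		 * only if CPU_RQ_SIZE (and CPU_NRQ_SIZE, used the same
		 * way below) is a power of two.
		 */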

		/* set error handle to zero so it can hold new error report */
		head_va->ehdl = 0;

		switch (errh_flt.errh_er.desc) {
		case ERRH_DESC_UCOR_RE:
			break;

		default:
			cmn_err(CE_WARN, "Error Descriptor 0x%llx "
			    "invalid in resumable error handler",
			    (long long) errh_flt.errh_er.desc);
			continue;
		}

		aflt = (struct async_flt *)&(errh_flt.cmn_asyncflt);
		aflt->flt_id = gethrtime();
		aflt->flt_bus_id = getprocessorid();
		aflt->flt_class = CPU_FAULT;
		aflt->flt_prot = AFLT_PROT_NONE;
		aflt->flt_priv = (((errh_flt.errh_er.attr & ERRH_MODE_MASK)
		    >> ERRH_MODE_SHIFT) == ERRH_MODE_PRIV);

		if (errh_flt.errh_er.attr & ERRH_ATTR_CPU)
			/* an error on another cpu is treated as fatal */
			aflt->flt_panic = 1;
		else
			aflt->flt_panic = 0;

		/*
		 * Handle the resumable queue full case.
		 */
		if (errh_flt.errh_er.attr & ERRH_ATTR_RQF) {
			errh_rq_full(aflt);
		}

		/*
		 * Queue the error on the ce or ue queue depending on
		 * flt_panic. Even if flt_panic is set, keep processing
		 * the remaining elements on the rq until the panic starts.
		 */
		cpu_queue_one_event(&errh_flt);

		/*
		 * Panic here if aflt->flt_panic has been set.
		 * Enqueued errors will be logged as part of the panic flow.
		 */
		if (aflt->flt_panic) {
			fm_panic("Unrecoverable error on another CPU");
		}
	}
}

void
process_nonresumable_error(struct regs *rp, uint64_t tl,
    uint32_t head_offset, uint32_t tail_offset)
{
	struct machcpu *mcpup;
	struct async_flt *aflt;
	errh_async_flt_t errh_flt;
	errh_er_t *head_va;
	int trampolined = 0;
	int expected = DDI_FM_ERR_UNEXPECTED;
	uint64_t exec_mode;

	mcpup = &(CPU->cpu_m);

	while (head_offset != tail_offset) {
		/* kernel buffer starts right after the nonresumable queue */
		head_va = (errh_er_t *)(mcpup->cpu_nrq_va + head_offset +
		    CPU_NRQ_SIZE);

		/* Copy the error report to local buffer */
		bzero(&errh_flt, sizeof (errh_async_flt_t));

		bcopy((char *)head_va, &(errh_flt.errh_er),
		    sizeof (errh_er_t));

		/* Increment the queue head */
		head_offset += Q_ENTRY_SIZE;
		/* Wrap around */
		head_offset &= (CPU_NRQ_SIZE - 1);

		/* set error handle to zero so it can hold new error report */
		head_va->ehdl = 0;

		aflt = (struct async_flt *)&(errh_flt.cmn_asyncflt);

		trampolined = 0;

		if (errh_flt.errh_er.attr & ERRH_ATTR_PIO)
			aflt->flt_class = BUS_FAULT;
		else
			aflt->flt_class = CPU_FAULT;

		aflt->flt_id = gethrtime();
		aflt->flt_bus_id = getprocessorid();
		aflt->flt_pc = (caddr_t)rp->r_pc;
		exec_mode = (errh_flt.errh_er.attr & ERRH_MODE_MASK)
		    >> ERRH_MODE_SHIFT;
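		/*
		 * A report with unknown execution mode is conservatively
		 * treated as privileged.
		 */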
		aflt->flt_priv = (exec_mode == ERRH_MODE_PRIV ||
		    exec_mode == ERRH_MODE_UNKNOWN);
		aflt->flt_tl = (uchar_t)tl;
		aflt->flt_prot = AFLT_PROT_NONE;
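		/*
		 * An error taken while already at TL > 0 cannot be safely
		 * resumed, so it is always treated as fatal; aft_testfatal
		 * forces a panic for testing.
		 */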
		aflt->flt_panic = ((aflt->flt_tl != 0) ||
		    (aft_testfatal != 0));

		switch (errh_flt.errh_er.desc) {
		case ERRH_DESC_PR_NRE:
			/*
			 * Fall through: a precise fault also needs to be
			 * checked to see if it was protected.
			 */

		case ERRH_DESC_DEF_NRE:
			/*
			 * If the trap occurred in privileged mode at TL=0,
			 * we need to check to see if we were executing
			 * in kernel under on_trap() or t_lofault
			 * protection. If so, modify the saved registers
			 * so that we return from the trap to the
			 * appropriate trampoline routine.
			 */
			if (aflt->flt_priv == 1 && aflt->flt_tl == 0)
				trampolined =
				    errh_error_protected(rp, aflt, &expected);

			if (!aflt->flt_priv || aflt->flt_prot ==
			    AFLT_PROT_COPY) {
				aflt->flt_panic |= aft_panic;
			} else if (!trampolined &&
			    aflt->flt_class != BUS_FAULT) {
				aflt->flt_panic = 1;
			}

			/*
			 * If this is a PIO error, we need to query the bus
			 * nexus for fatal errors.
			 */
			if (aflt->flt_class == BUS_FAULT) {
				aflt->flt_addr = errh_flt.errh_er.ra;
				errh_cpu_run_bus_error_handlers(aflt,
				    expected);
			}

			break;

		default:
			cmn_err(CE_WARN, "Error Descriptor 0x%llx "
			    "invalid in nonresumable error handler",
			    (long long) errh_flt.errh_er.desc);
			continue;
		}

		/*
		 * Queue the error report for further processing. If
		 * flt_panic is set, we still process the other errors
		 * in the queue until the panic routine stops the
		 * kernel.
		 */
		cpu_queue_one_event(&errh_flt);

		/*
		 * Panic here if aflt->flt_panic has been set.
		 * Enqueued errors will be logged as part of the panic flow.
		 */
		if (aflt->flt_panic) {
			fm_panic("Unrecoverable hardware error");
		}

		/*
		 * If it is a memory error, we turn on the PAGE_IS_TOXIC
		 * flag. The page will be retired later and scrubbed when
		 * it is freed.
		 */
		if (errh_flt.errh_er.attr & ERRH_ATTR_MEM)
			errh_page_settoxic(&errh_flt, PAGE_IS_TOXIC);

		/*
		 * If we queued an error and it was in user mode or
		 * protected by t_lofault, set the AST flag so the queue
		 * will be drained before returning to user mode.
		 */
		if (!aflt->flt_priv || aflt->flt_prot == AFLT_PROT_COPY) {
			int pcb_flag = 0;

			if (aflt->flt_class == CPU_FAULT)
				pcb_flag |= ASYNC_HWERR;
			else if (aflt->flt_class == BUS_FAULT)
				pcb_flag |= ASYNC_BERR;

			ttolwp(curthread)->lwp_pcb.pcb_flags |= pcb_flag;
			aston(curthread);
		}
	}
}

/*
 * For PIO errors, this routine calls the nexus driver's error
 * callback routines. If a callback routine returns fatal, and
 * we are in kernel or unknown mode without any error protection,
 * we need to turn on the panic flag.
 */
void
errh_cpu_run_bus_error_handlers(struct async_flt *aflt, int expected)
{
	int status;
	ddi_fm_error_t de;

	bzero(&de, sizeof (ddi_fm_error_t));

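	/*
	 * Build a DDI FM error payload for the nexus callbacks. The ENA is
	 * generated from the fault's hrtime-based id so FMA can correlate
	 * any resulting ereports, and fme_bus_specific carries the faulting
	 * address down to the nexus drivers; the dispatch walks the device
	 * tree from the root node.
	 */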
	de.fme_version = DDI_FME_VERSION;
	de.fme_ena = fm_ena_generate(aflt->flt_id, FM_ENA_FMT1);
	de.fme_flag = expected;
	de.fme_bus_specific = (void *)aflt->flt_addr;
	status = ndi_fm_handler_dispatch(ddi_root_node(), NULL, &de);

	/*
	 * If the error is protected, we will jump to the proper routine
	 * to handle it; if it happened at user level, we just kill the
	 * user process; if the driver thinks the error is not fatal,
	 * we can drive on. If none of the above is true, we panic.
	 */
	if ((aflt->flt_prot == AFLT_PROT_NONE) && (aflt->flt_priv == 1) &&
	    (status == DDI_FM_FATAL))
		aflt->flt_panic = 1;
}

/*
 * This routine checks to see if we are under any error protection when
 * the error happens. If we are under error protection, we unwind to
 * the protection code and indicate the fault.
 */
static int
errh_error_protected(struct regs *rp, struct async_flt *aflt, int *expected)
{
	int trampolined = 0;
	ddi_acc_hdl_t *hp;

	if (curthread->t_ontrap != NULL) {
		on_trap_data_t *otp = curthread->t_ontrap;

		if (otp->ot_prot & OT_DATA_EC) {
			aflt->flt_prot = AFLT_PROT_EC;
			otp->ot_trap |= OT_DATA_EC;
			rp->r_pc = otp->ot_trampoline;
			rp->r_npc = rp->r_pc + 4;
			trampolined = 1;
		}

		if (otp->ot_prot & OT_DATA_ACCESS) {
			aflt->flt_prot = AFLT_PROT_ACCESS;
			otp->ot_trap |= OT_DATA_ACCESS;
			rp->r_pc = otp->ot_trampoline;
			rp->r_npc = rp->r_pc + 4;
			trampolined = 1;
			/*
			 * For peeks and cautious gets, errors are
			 * expected.
			 */
			hp = (ddi_acc_hdl_t *)otp->ot_handle;
			if (!hp)
				*expected = DDI_FM_ERR_PEEK;
			else if (hp->ah_acc.devacc_attr_access ==
			    DDI_CAUTIOUS_ACC)
				*expected = DDI_FM_ERR_EXPECTED;
		}
	} else if (curthread->t_lofault) {
		aflt->flt_prot = AFLT_PROT_COPY;
		rp->r_g1 = EFAULT;
		rp->r_pc = curthread->t_lofault;
		rp->r_npc = rp->r_pc + 4;
		trampolined = 1;
	}

	return (trampolined);
}

/*
 * Queue one event.
 */
static void
cpu_queue_one_event(errh_async_flt_t *errh_fltp)
{
	struct async_flt *aflt = (struct async_flt *)errh_fltp;
	errorq_t *eqp;

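	/*
	 * Panic-bound events go to the UE queue, which error_init() creates
	 * with ERRORQ_VITAL so that they are still logged as part of the
	 * panic flow; everything else goes to the CE queue.
	 */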
	if (aflt->flt_panic)
		eqp = ue_queue;
	else
		eqp = ce_queue;

	errorq_dispatch(eqp, errh_fltp, sizeof (errh_async_flt_t),
	    aflt->flt_panic);
}

/*
 * The cpu_async_log_err() function is called by the ce/ue_drain() function to
 * handle logging for CPU events that are dequeued.  As such, it can be invoked
 * from softint context, from AST processing in the trap() flow, or from the
 * panic flow.  We decode the CPU-specific data, and log appropriate messages.
 */
void
cpu_async_log_err(void *flt)
{
	errh_async_flt_t *errh_fltp = (errh_async_flt_t *)flt;
	errh_er_t *errh_erp = (errh_er_t *)&errh_fltp->errh_er;

	switch (errh_erp->desc) {
	case ERRH_DESC_UCOR_RE:
		if (errh_erp->attr & ERRH_ATTR_MEM) {
			/*
			 * Turn on the PAGE_IS_TOXIC flag. The page will be
			 * scrubbed when it is freed.
			 */
			errh_page_settoxic(errh_fltp, PAGE_IS_TOXIC);
		}

		break;

	case ERRH_DESC_PR_NRE:
	case ERRH_DESC_DEF_NRE:
		if (errh_erp->attr & ERRH_ATTR_MEM) {
			/*
			 * For a non-resumable memory error, retire
			 * the page here.
			 */
			errh_page_retire(errh_fltp);
		}
		break;

	default:
		break;
	}
}

/*
 * Called from ce_drain().
 */
void
cpu_ce_log_err(struct async_flt *aflt)
{
	switch (aflt->flt_class) {
	case CPU_FAULT:
		cpu_async_log_err(aflt);
		break;

	case BUS_FAULT:
		cpu_async_log_err(aflt);
		break;

	default:
		break;
	}
}

/*
 * Called from ue_drain().
 */
void
cpu_ue_log_err(struct async_flt *aflt)
{
	switch (aflt->flt_class) {
	case CPU_FAULT:
		cpu_async_log_err(aflt);
		break;

	case BUS_FAULT:
		cpu_async_log_err(aflt);
		break;

	default:
		break;
	}
}

/*
 * Turn on the given toxic flag for every page in the error report's
 * real-address range.
 */
static void
errh_page_settoxic(errh_async_flt_t *errh_fltp, uchar_t flag)
{
	page_t *pp;
	uint64_t flt_real_addr_start = errh_fltp->errh_er.ra;
	uint64_t flt_real_addr_end = flt_real_addr_start +
	    errh_fltp->errh_er.sz - 1;
	int64_t current_addr;

	if (errh_fltp->errh_er.sz == 0)
		return;

	for (current_addr = flt_real_addr_start;
	    current_addr < flt_real_addr_end; current_addr += MMU_PAGESIZE) {
		pp = page_numtopp_nolock((pfn_t)
		    (current_addr >> MMU_PAGESHIFT));

		if (pp != NULL) {
			page_settoxic(pp, flag);
		}
	}
}

/*
 * Retire the page(s) indicated in the error report.
 */
static void
errh_page_retire(errh_async_flt_t *errh_fltp)
{
	page_t *pp;
	uint64_t flt_real_addr_start = errh_fltp->errh_er.ra;
	uint64_t flt_real_addr_end = flt_real_addr_start +
	    errh_fltp->errh_er.sz - 1;
	int64_t current_addr;

	if (errh_fltp->errh_er.sz == 0)
		return;

	for (current_addr = flt_real_addr_start;
	    current_addr < flt_real_addr_end; current_addr += MMU_PAGESIZE) {
		pp = page_numtopp_nolock((pfn_t)
		    (current_addr >> MMU_PAGESHIFT));

		if (pp != NULL) {
			(void) page_retire(pp, PAGE_IS_TOXIC);
		}
	}
}

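/*
 * Scrub the given physical address range through the hypervisor's
 * hv_mem_scrub() service.
 */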
void
mem_scrub(uint64_t paddr, uint64_t len)
{
	uint64_t pa, length, scrubbed_len;
	uint64_t ret = H_EOK;

	pa = paddr;
	length = len;
	scrubbed_len = 0;

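	/*
	 * The hypervisor may scrub only part of the range per call, so
	 * advance past whatever was scrubbed and retry until the whole
	 * range is done or the call fails.
	 */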
	while (ret == H_EOK) {
		ret = hv_mem_scrub(pa, length, &scrubbed_len);

		if (ret != H_EOK || scrubbed_len >= length) {
			break;
		}

		pa += scrubbed_len;
		length -= scrubbed_len;
	}
}

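/*
 * Flush the contents of the given virtual address range back to memory
 * through the hypervisor's hv_mem_sync() service, retrying on partial
 * progress the same way mem_scrub() does.
 */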
void
mem_sync(caddr_t va, size_t len)
{
	uint64_t pa, length, flushed;
	uint64_t ret = H_EOK;

	pa = va_to_pa((caddr_t)va);

	if (pa == (uint64_t)-1)
		return;

	length = len;
	flushed = 0;

	while (ret == H_EOK) {
		ret = hv_mem_sync(pa, length, &flushed);

		if (ret != H_EOK || flushed >= length) {
			break;
		}

		pa += flushed;
		length -= flushed;
	}
}

/*
 * If the resumable queue is full, we need to check whether any cpu is in
 * the error state. If not, we drive on; if so, we need to panic. The
 * hypervisor call hv_cpu_state() is used to check the cpu state.
 */
static void
errh_rq_full(struct async_flt *afltp)
{
	processorid_t who;
	uint64_t cpu_state;
	uint64_t retval;

	for (who = 0; who < NCPU; who++)
		if (CPU_IN_SET(cpu_ready_set, who)) {
			retval = hv_cpu_state(who, &cpu_state);
			if (retval != H_EOK || cpu_state == CPU_STATE_ERROR) {
				afltp->flt_panic = 1;
				break;
			}
		}
}

/*
 * Return the size of the processor-specific async error structure.
 */
int
cpu_aflt_size(void)
{
	return (sizeof (errh_async_flt_t));
}

#define	SZ_TO_ETRS_SHIFT	6

/*
 * Called when the resumable queue overflows; currently we just count
 * the occurrences.
 */
/*ARGSUSED*/
void
rq_overflow(struct regs *rp, uint64_t head_offset,
    uint64_t tail_offset)
{
	rq_overflow_count++;
}

/*
 * Handler to process a fatal error.  This routine can be called from a
 * softint, from trap()'s AST handling, or from the panic flow.
 */
/*ARGSUSED*/
static void
ue_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep)
{
	cpu_ue_log_err(aflt);
}

/*
 * Handler to process a correctable error.  This routine can be called from a
 * softint.  We just call the CPU module's logging routine.
 */
/*ARGSUSED*/
static void
ce_drain(void *ignored, struct async_flt *aflt, errorq_elem_t *eqep)
{
	cpu_ce_log_err(aflt);
}

/*
 * Allocate error queue sizes based on max_ncpus.  max_ncpus is set just
 * after ncpunode has been determined.  ncpus is set in start_other_cpus
 * which is called after error_init() but may change dynamically.
 */
void
error_init(void)
{
	char tmp_name[MAXSYSNAME];
	dnode_t node;
	size_t size = cpu_aflt_size();

	/*
	 * Initialize the correctable and uncorrectable error queues. The
	 * UE queue is created ERRORQ_VITAL so that pending fatal events
	 * are still logged as part of the panic flow.
	 */
	ue_queue = errorq_create("ue_queue", (errorq_func_t)ue_drain, NULL,
	    MAX_ASYNC_FLTS * (max_ncpus + 1), size, PIL_2, ERRORQ_VITAL);

	ce_queue = errorq_create("ce_queue", (errorq_func_t)ce_drain, NULL,
	    MAX_CE_FLTS * (max_ncpus + 1), size, PIL_1, 0);

	if (ue_queue == NULL || ce_queue == NULL)
		panic("failed to create required system error queue");

	/*
	 * Initialize the busfunc list mutex.  This must be a PIL_15 spin lock
	 * because we will need to acquire it from cpu_async_error().
	 */
	mutex_init(&bfd_lock, NULL, MUTEX_SPIN, (void *)PIL_15);

	node = prom_rootnode();
	if ((node == OBP_NONODE) || (node == OBP_BADNODE)) {
		cmn_err(CE_CONT, "error_init: node 0x%x\n", (uint_t)node);
		return;
	}

	if (((size = prom_getproplen(node, "reset-reason")) != -1) &&
	    (size <= MAXSYSNAME) &&
	    (prom_getprop(node, "reset-reason", tmp_name) != -1)) {
		if (reset_debug) {
			cmn_err(CE_CONT, "System booting after %s\n", tmp_name);
		} else if (strncmp(tmp_name, "FATAL", 5) == 0) {
			cmn_err(CE_CONT,
			    "System booting after fatal error %s\n", tmp_name);
		}
	}
}