xref: /onnv-gate/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_cpuerr.c (revision 10784:15baf8dd1081)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Ereport-handling routines for CPU errors
28  */
29 
30 #include <cmd_cpu.h>
31 #include <cmd.h>
32 
33 #include <strings.h>
34 #include <string.h>
35 #include <errno.h>
36 #include <fm/fmd_api.h>
37 #include <sys/fm/protocol.h>
38 #include <sys/async.h>
39 #ifdef sun4u
40 #include <sys/fm/cpu/UltraSPARC-III.h>
41 #include <cmd_Lxcache.h>
42 #include <cmd_opl.h>
43 #endif
44 
45 /*
46  * We follow the same algorithm for handling all L1$, TLB, and L2/L3 cache
47  * tag events so we can have one common routine into which each handler
48  * calls.  The two tests of (strcmp(serdnm, "") != 0) are used to eliminate
49  * the need for a separate macro for UEs which override SERD engine
50  * counting CEs leading to same fault.
51  */
52 /*ARGSUSED9*/
53 static cmd_evdisp_t
cmd_cpuerr_common(fmd_hdl_t * hdl,fmd_event_t * ep,cmd_cpu_t * cpu,cmd_case_t * cc,cmd_ptrsubtype_t pstype,const char * serdnm,const char * serdn,const char * serdt,const char * fltnm,cmd_errcl_t clcode)54 cmd_cpuerr_common(fmd_hdl_t *hdl, fmd_event_t *ep, cmd_cpu_t *cpu,
55     cmd_case_t *cc, cmd_ptrsubtype_t pstype, const char *serdnm,
56     const char *serdn, const char *serdt, const char *fltnm,
57     cmd_errcl_t clcode)
58 {
59 	const char *uuid;
60 
61 	if (cc->cc_cp != NULL && fmd_case_solved(hdl, cc->cc_cp))
62 		return (CMD_EVD_REDUND);
63 
64 	if (cc->cc_cp == NULL) {
65 		cc->cc_cp = cmd_case_create(hdl, &cpu->cpu_header, pstype,
66 		    &uuid);
67 		if (strcmp(serdnm, "") != 0) {
68 			cc->cc_serdnm = cmd_cpu_serdnm_create(hdl, cpu,
69 			    serdnm);
70 			fmd_serd_create(hdl, cc->cc_serdnm,
71 			    fmd_prop_get_int32(hdl, serdn),
72 			    fmd_prop_get_int64(hdl, serdt));
73 		}
74 	}
75 
76 	if (strcmp(serdnm, "") != 0) {
77 		fmd_hdl_debug(hdl, "adding event to %s\n", cc->cc_serdnm);
78 		if (fmd_serd_record(hdl, cc->cc_serdnm, ep) == FMD_B_FALSE)
79 			return (CMD_EVD_OK); /* serd engine hasn't fired yet */
80 
81 		fmd_case_add_serd(hdl, cc->cc_cp, cc->cc_serdnm);
82 	} else {
83 		if (cc->cc_serdnm != NULL) {
84 			fmd_hdl_debug(hdl,
85 			    "destroying existing %s state for class %x\n",
86 			    cc->cc_serdnm, clcode);
87 			fmd_serd_destroy(hdl, cc->cc_serdnm);
88 			fmd_hdl_strfree(hdl, cc->cc_serdnm);
89 			cc->cc_serdnm = NULL;
90 		}
91 		fmd_case_reset(hdl, cc->cc_cp);
92 		fmd_case_add_ereport(hdl, cc->cc_cp, ep);
93 	}
94 
95 	cmd_cpu_create_faultlist(hdl, cc->cc_cp, cpu, fltnm, NULL, 100);
96 
97 	fmd_case_solve(hdl, cc->cc_cp);
98 
99 	return (CMD_EVD_OK);
100 }
#ifdef sun4u

/*
 * Tag-error handler template (sun4u only).  Panther (UltraSPARC-IV+)
 * L2/L3 tag correctable events are routed to cmd_us4plus_tag_err();
 * everything else falls through to cmd_cpuerr_common().
 */
#define	CMD_CPU_TAGHANDLER(name, casenm, ptr, ntname, fltname)	\
cmd_evdisp_t								\
cmd_##name(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,		\
    const char *class, cmd_errcl_t clcode)				\
{									\
	uint8_t level = clcode & CMD_ERRCL_LEVEL_EXTRACT;		\
	cmd_cpu_t *cpu;							\
									\
	clcode &= CMD_ERRCL_LEVEL_MASK;					\
	if ((cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class,	\
	    level)) == NULL || cpu->cpu_faulting)			\
		return (CMD_EVD_UNUSED);				\
									\
	/* strstr() returns a pointer; compare against NULL */		\
	if ((strstr(class, "ultraSPARC-IVplus.l3-thce") != NULL) ||	\
	    (strstr(class, "ultraSPARC-IVplus.thce") != NULL)) {	\
		return (cmd_us4plus_tag_err(hdl, ep, nvl, cpu,		\
		    ptr, ntname "_n", ntname "_t", fltname, clcode));	\
	}								\
	return (cmd_cpuerr_common(hdl, ep, cpu, &cpu->cpu_##casenm,	\
	    ptr, ntname, ntname "_n", ntname "_t", fltname, clcode));	\
}
#endif
125 
/*
 * Simple handler template: resolve the detecting CPU from the ereport's
 * detector FMRI, then delegate to cmd_cpuerr_common() with the per-unit
 * case pointer and SERD property names derived from "ntname".
 */
#define	CMD_CPU_SIMPLEHANDLER(name, casenm, ptr, ntname, fltname)	\
cmd_evdisp_t								\
cmd_##name(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,		\
    const char *class, cmd_errcl_t clcode)				\
{									\
	uint8_t level = clcode & CMD_ERRCL_LEVEL_EXTRACT;		\
	cmd_cpu_t *cpu;							\
									\
	clcode &= CMD_ERRCL_LEVEL_MASK;					\
	if ((cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class,	\
	    level)) == NULL || cpu->cpu_faulting)			\
		return (CMD_EVD_UNUSED);				\
									\
	return (cmd_cpuerr_common(hdl, ep, cpu, &cpu->cpu_##casenm,	\
	    ptr, ntname, ntname "_n", ntname "_t", fltname, clcode));	\
}
142 
143 #ifdef sun4u
144 CMD_CPU_TAGHANDLER(txce, l2tag, CMD_PTR_CPU_L2TAG, "l2tag", "l2cachetag")
145 CMD_CPU_TAGHANDLER(l3_thce, l3tag, CMD_PTR_CPU_L3TAG, "l3tag", "l3cachetag")
146 #else
147 CMD_CPU_SIMPLEHANDLER(txce, l2tag, CMD_PTR_CPU_L2TAG, "l2tag", "l2cachetag")
148 CMD_CPU_SIMPLEHANDLER(l3_thce, l3tag, CMD_PTR_CPU_L3TAG, "l3tag", "l3cachetag")
149 #endif
150 CMD_CPU_SIMPLEHANDLER(icache, icache, CMD_PTR_CPU_ICACHE, "icache", "icache")
151 CMD_CPU_SIMPLEHANDLER(dcache, dcache, CMD_PTR_CPU_DCACHE, "dcache", "dcache")
152 CMD_CPU_SIMPLEHANDLER(pcache, pcache, CMD_PTR_CPU_PCACHE, "pcache", "pcache")
153 CMD_CPU_SIMPLEHANDLER(itlb, itlb, CMD_PTR_CPU_ITLB, "itlb", "itlb")
154 CMD_CPU_SIMPLEHANDLER(dtlb, dtlb, CMD_PTR_CPU_DTLB, "dtlb", "dtlb")
155 CMD_CPU_SIMPLEHANDLER(irc, ireg, CMD_PTR_CPU_IREG, "ireg", "ireg")
156 CMD_CPU_SIMPLEHANDLER(frc, freg, CMD_PTR_CPU_FREG, "freg", "freg")
157 CMD_CPU_SIMPLEHANDLER(mau, mau, CMD_PTR_CPU_MAU, "mau", "mau")
158 CMD_CPU_SIMPLEHANDLER(miscregs_ce, misc_regs, CMD_PTR_CPU_MISC_REGS,
159 	"misc_regs", "misc_reg")
160 CMD_CPU_SIMPLEHANDLER(l2c, l2data, CMD_PTR_CPU_L2DATA, "l2data", "l2data-c")
161 
162 CMD_CPU_SIMPLEHANDLER(fpu, fpu, CMD_PTR_CPU_FPU, "", "fpu")
163 CMD_CPU_SIMPLEHANDLER(l2ctl, l2ctl, CMD_PTR_CPU_L2CTL, "", "l2cachectl")
164 CMD_CPU_SIMPLEHANDLER(iru, ireg, CMD_PTR_CPU_IREG, "", "ireg")
165 CMD_CPU_SIMPLEHANDLER(fru, freg, CMD_PTR_CPU_FREG, "", "freg")
166 CMD_CPU_SIMPLEHANDLER(miscregs_ue, misc_regs, CMD_PTR_CPU_MISC_REGS,
167 	"", "misc_reg")
168 CMD_CPU_SIMPLEHANDLER(l2u, l2data, CMD_PTR_CPU_L2DATA, "", "l2data-u")
169 CMD_CPU_SIMPLEHANDLER(lfu_ue, lfu, CMD_PTR_CPU_LFU, "", "lfu-u")
170 CMD_CPU_SIMPLEHANDLER(lfu_ce, lfu, CMD_PTR_CPU_LFU, "", "lfu-f")
171 CMD_CPU_SIMPLEHANDLER(lfu_pe, lfu, CMD_PTR_CPU_LFU, "", "lfu-p")
172 
173 
174 
175 /*
176  * Fp-scrubber errors
177  */
178 cmd_evdisp_t
cmd_fps(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)179 cmd_fps(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
180 		const char *class, cmd_errcl_t clcode)
181 {
182 	uint8_t level = clcode & CMD_ERRCL_LEVEL_EXTRACT;
183 	cmd_cpu_t *cpu;
184 	nvlist_t *res;
185 
186 	clcode &= CMD_ERRCL_LEVEL_MASK;
187 
188 	/*
189 	 * Ignore the event if resource FMRI is not present. Fp-Scrubber
190 	 * puts the indicted CPU in resource. If resource is not present,
191 	 * we cannot diagnose the ereport. It will simply get logged in
192 	 * errlog for manual analysis, if needed.
193 	 */
194 	if (nvlist_lookup_nvlist(nvl, "resource", &res))
195 		return (CMD_EVD_UNUSED);
196 
197 	if ((cpu = cmd_cpu_lookup(hdl, res, class, level)) == NULL ||
198 	    cpu->cpu_faulting)
199 		return (CMD_EVD_UNUSED);
200 
201 	return (cmd_cpuerr_common(hdl, ep, cpu, &cpu->cpu_fpu,
202 	    CMD_PTR_CPU_FPU, "", "_n", "_t", "fpu", clcode));
203 }
204 
205 
206 
207 
208 #ifdef sun4u
/*
 * The following macro handles UEs or CPU errors.
 * It handles the error cases in which there is with or
 * without "resource".
 *
 * If the "fltname" "core" is to be generated, the sibling CPUs
 * within the core will be added to the suspect list.
 * If the "fltname" "chip" is to be generated, the sibling CPUs
 * within the chip will be added to the suspect list.
 * If the "fltname" "strand" is to be generated, the strand
 * itself will be in the suspect list.
 */
#define	CMD_OPL_UEHANDLER(name, casenm, ptr, fltname, has_rsrc)		\
cmd_evdisp_t								\
cmd_##name(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,		\
    const char *class, cmd_errcl_t clcode)				\
{									\
	cmd_cpu_t *cpu;							\
	cmd_case_t *cc;							\
	cmd_evdisp_t rc;						\
	nvlist_t  *rsrc = NULL;						\
	uint8_t cpumask, version = 1;					\
	uint8_t lookup_rsrc = has_rsrc;					\
									\
	fmd_hdl_debug(hdl,						\
	    "Enter cmd_opl_ue_cpu for class %x\n", clcode);		\
									\
	if (lookup_rsrc) {						\
		/* CPU comes from the ereport's resource FMRI */	\
		if (nvlist_lookup_nvlist(nvl,				\
		    FM_EREPORT_PAYLOAD_NAME_RESOURCE, &rsrc) != 0)	\
			return (CMD_EVD_BAD);				\
									\
		if ((cpu = cmd_cpu_lookup(hdl, rsrc, class,		\
		    CMD_CPU_LEVEL_THREAD)) == NULL ||			\
		    cpu->cpu_faulting)					\
			return (CMD_EVD_UNUSED);			\
	} else {							\
		/* no resource: fall back to the detector FMRI */	\
		if ((cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class,\
		    CMD_CPU_LEVEL_THREAD)) == NULL || cpu->cpu_faulting)\
			return (CMD_EVD_UNUSED);			\
									\
		(void) nvlist_lookup_nvlist(nvl,			\
		    FM_EREPORT_DETECTOR, &rsrc);			\
	}								\
									\
	if (nvlist_lookup_uint8(rsrc, FM_VERSION, &version) != 0 ||	\
	    version > FM_CPU_SCHEME_VERSION ||				\
	    nvlist_lookup_uint8(rsrc, FM_FMRI_CPU_MASK, &cpumask) != 0)	\
		return (CMD_EVD_BAD);					\
									\
	cc = &cpu->cpu_##casenm;					\
	rc = cmd_opl_ue_cpu(hdl, ep, class, fltname,			\
	    ptr, cpu, cc, cpumask);					\
	return (rc);							\
}
264 
265 /*
266  * CPU errors without resource
267  */
268 CMD_OPL_UEHANDLER(oplinv_urg, opl_inv_urg, CMD_PTR_CPU_UGESR_INV_URG, "core", 0)
269 CMD_OPL_UEHANDLER(oplcre, opl_cre, CMD_PTR_CPU_UGESR_CRE, "core", 0)
270 CMD_OPL_UEHANDLER(opltsb_ctx, opl_tsb_ctx, CMD_PTR_CPU_UGESR_TSB_CTX, "core", 0)
271 CMD_OPL_UEHANDLER(opltsbp, opl_tsbp, CMD_PTR_CPU_UGESR_TSBP, "core", 0)
272 CMD_OPL_UEHANDLER(oplpstate, opl_pstate, CMD_PTR_CPU_UGESR_PSTATE, "core", 0)
273 CMD_OPL_UEHANDLER(opltstate, opl_tstate, CMD_PTR_CPU_UGESR_TSTATE, "core", 0)
274 CMD_OPL_UEHANDLER(opliug_f, opl_iug_f, CMD_PTR_CPU_UGESR_IUG_F, "core", 0)
275 CMD_OPL_UEHANDLER(opliug_r, opl_iug_r, CMD_PTR_CPU_UGESR_IUG_R, "core", 0)
276 CMD_OPL_UEHANDLER(oplsdc, opl_sdc, CMD_PTR_CPU_UGESR_SDC, "chip", 0)
277 CMD_OPL_UEHANDLER(oplwdt, opl_wdt, CMD_PTR_CPU_UGESR_WDT, "core", 0)
278 CMD_OPL_UEHANDLER(opldtlb, opl_dtlb, CMD_PTR_CPU_UGESR_DTLB, "core", 0)
279 CMD_OPL_UEHANDLER(oplitlb, opl_itlb, CMD_PTR_CPU_UGESR_ITLB, "core", 0)
280 CMD_OPL_UEHANDLER(oplcore_err, opl_core_err, CMD_PTR_CPU_UGESR_CORE_ERR,
281 "core", 0)
282 CMD_OPL_UEHANDLER(opldae, opl_dae, CMD_PTR_CPU_UGESR_DAE, "core", 0)
283 CMD_OPL_UEHANDLER(opliae, opl_iae, CMD_PTR_CPU_UGESR_IAE, "core", 0)
284 CMD_OPL_UEHANDLER(opluge, opl_uge, CMD_PTR_CPU_UGESR_UGE, "core", 0)
285 
286 /*
287  * UEs with resource
288  */
289 CMD_OPL_UEHANDLER(oplinv_sfsr, opl_invsfsr, CMD_PTR_CPU_INV_SFSR, "strand", 1)
290 CMD_OPL_UEHANDLER(opluecpu_detcpu, oplue_detcpu, CMD_PTR_CPU_UE_DET_CPU,
291 "core", 1)
292 CMD_OPL_UEHANDLER(opluecpu_detio, oplue_detio, CMD_PTR_CPU_UE_DET_IO, "core", 1)
293 CMD_OPL_UEHANDLER(oplmtlb, opl_mtlb, CMD_PTR_CPU_MTLB, "core", 1)
294 CMD_OPL_UEHANDLER(opltlbp, opl_tlbp, CMD_PTR_CPU_TLBP, "core", 1)
295 #endif	/* sun4u */
296 
297 /*ARGSUSED*/
298 static void
cmd_nop_hdlr(fmd_hdl_t * hdl,cmd_xr_t * xr,fmd_event_t * ep)299 cmd_nop_hdlr(fmd_hdl_t *hdl, cmd_xr_t *xr, fmd_event_t *ep)
300 {
301 	fmd_hdl_debug(hdl, "nop train resolved for clcode %llx\n",
302 	    xr->xr_clcode);
303 }
304 /*ARGSUSED*/
305 static void
cmd_xxu_hdlr(fmd_hdl_t * hdl,cmd_xr_t * xr,fmd_event_t * ep)306 cmd_xxu_hdlr(fmd_hdl_t *hdl, cmd_xr_t *xr, fmd_event_t *ep)
307 {
308 	const errdata_t *ed;
309 	cmd_cpu_t *cpu = xr->xr_cpu;
310 	cmd_case_t *cc;
311 	const char *uuid;
312 	nvlist_t *rsrc = NULL;
313 
314 	cmd_fill_errdata(xr->xr_clcode, cpu, &cc, &ed);
315 
316 	if (cpu->cpu_faulting) {
317 		CMD_STAT_BUMP(xxu_retr_flt);
318 		return;
319 	}
320 
321 	if (cmd_afar_status_check(xr->xr_afar_status, xr->xr_clcode) < 0) {
322 		fmd_hdl_debug(hdl, "xxU dropped, afar not VALID\n");
323 		return;
324 	}
325 
326 	if (cmd_cpu_synd_check(xr->xr_synd, xr->xr_clcode) < 0) {
327 		fmd_hdl_debug(hdl, "xxU/LDxU dropped due to syndrome\n");
328 		return;
329 	}
330 
331 #ifdef sun4u
332 	/*
333 	 * UE cache needed for sun4u only, because sun4u doesn't poison
334 	 * uncorrectable data loaded into L2/L3 cache.
335 	 */
336 	if (cmd_cpu_uec_match(xr->xr_cpu, xr->xr_afar)) {
337 		fmd_hdl_debug(hdl, "ue matched in UE cache\n");
338 		CMD_STAT_BUMP(xxu_ue_match);
339 		return;
340 	}
341 #endif /* sun4u */
342 
343 	/*
344 	 * We didn't match in the UE cache.  We don't need to sleep for UE
345 	 * arrival, as we've already slept once for the train match.
346 	 */
347 
348 	if (cc->cc_cp == NULL) {
349 		cc->cc_cp = cmd_case_create(hdl, &cpu->cpu_header, ed->ed_pst,
350 		    &uuid);
351 	} else if (cc->cc_serdnm != NULL) {
352 		fmd_hdl_debug(hdl, "destroying existing %s state\n",
353 		    cc->cc_serdnm);
354 
355 		fmd_serd_destroy(hdl, cc->cc_serdnm);
356 		fmd_hdl_strfree(hdl, cc->cc_serdnm);
357 		cc->cc_serdnm = NULL;
358 
359 		fmd_case_reset(hdl, cc->cc_cp);
360 	}
361 
362 	if (xr->xr_rsrc_nvl != NULL && nvlist_dup(xr->xr_rsrc_nvl,
363 	    &rsrc, 0) != 0) {
364 		fmd_hdl_abort(hdl, "failed to duplicate resource FMRI for "
365 		    "%s fault", ed->ed_fltnm);
366 	}
367 
368 	fmd_case_add_ereport(hdl, cc->cc_cp, ep);
369 
370 	cmd_cpu_create_faultlist(hdl, cc->cc_cp, cpu, ed->ed_fltnm, rsrc, 100);
371 	nvlist_free(rsrc);
372 	fmd_case_solve(hdl, cc->cc_cp);
373 }
374 
375 static void
cmd_xxc_hdlr(fmd_hdl_t * hdl,cmd_xr_t * xr,fmd_event_t * ep)376 cmd_xxc_hdlr(fmd_hdl_t *hdl, cmd_xr_t *xr, fmd_event_t *ep)
377 {
378 	const errdata_t *ed;
379 	cmd_cpu_t *cpu = xr->xr_cpu;
380 	cmd_case_t *cc;
381 	const char *uuid;
382 	nvlist_t *rsrc = NULL;
383 
384 #ifdef	sun4u
385 	if (cmd_cache_ce_panther(hdl, ep, xr) == 0) {
386 		return;
387 	}
388 #endif
389 	cmd_fill_errdata(xr->xr_clcode, cpu, &cc, &ed);
390 
391 	if (cpu->cpu_faulting || (cc->cc_cp != NULL &&
392 	    fmd_case_solved(hdl, cc->cc_cp)))
393 		return;
394 
395 	if (cc->cc_cp == NULL) {
396 		cc->cc_cp = cmd_case_create(hdl, &cpu->cpu_header, ed->ed_pst,
397 		    &uuid);
398 		cc->cc_serdnm = cmd_cpu_serdnm_create(hdl, cpu,
399 		    ed->ed_serd->cs_name);
400 
401 		fmd_serd_create(hdl, cc->cc_serdnm, ed->ed_serd->cs_n,
402 		    ed->ed_serd->cs_t);
403 	}
404 
405 	fmd_hdl_debug(hdl, "adding event to %s\n", cc->cc_serdnm);
406 
407 	if (fmd_serd_record(hdl, cc->cc_serdnm, ep) == FMD_B_FALSE)
408 		return; /* serd engine hasn't fired yet */
409 
410 	if (xr->xr_rsrc_nvl != NULL && nvlist_dup(xr->xr_rsrc_nvl,
411 	    &rsrc, 0) != 0) {
412 		fmd_hdl_abort(hdl, "failed to duplicate resource FMRI for "
413 		    "%s fault", ed->ed_fltnm);
414 	}
415 
416 	fmd_case_add_serd(hdl, cc->cc_cp, cc->cc_serdnm);
417 	cmd_cpu_create_faultlist(hdl, cc->cc_cp, cpu, ed->ed_fltnm, rsrc, 100);
418 	nvlist_free(rsrc);
419 	fmd_case_solve(hdl, cc->cc_cp);
420 }
421 
422 /*
423  * We're back from the timeout.  Check to see if this event was part of a train.
424  * If it was, make sure to only process the cause of the train.  If not,
425  * process the event directly.
426  */
427 static void
cmd_xxcu_resolve(fmd_hdl_t * hdl,cmd_xr_t * xr,fmd_event_t * ep,cmd_xr_hdlr_f * hdlr)428 cmd_xxcu_resolve(fmd_hdl_t *hdl, cmd_xr_t *xr, fmd_event_t *ep,
429     cmd_xr_hdlr_f *hdlr)
430 {
431 	cmd_xxcu_trw_t *trw;
432 	cmd_errcl_t cause;
433 	uint64_t afar;
434 
435 
436 	afar = NULL;
437 
438 	if (xr->xr_afar_status == AFLT_STAT_VALID)
439 		afar = xr->xr_afar;
440 
441 	if ((trw = cmd_trw_lookup(xr->xr_ena,
442 	    xr->xr_afar_status, afar)) == NULL) {
443 		fmd_hdl_debug(hdl, "cmd_trw_lookup: Not found\n");
444 		return;
445 	}
446 
447 	fmd_hdl_debug(hdl, "found waiter with mask 0x%08llx\n", trw->trw_mask);
448 
449 	trw->trw_flags |= CMD_TRW_F_DELETING;
450 
451 	/*
452 	 * In sun4v, the matching train rule is changed. It matches only
453 	 * a portion of the train mask, so can't discard the rest of
454 	 * the error in the train mask.
455 	 */
456 #ifdef sun4u
457 	if (trw->trw_flags & CMD_TRW_F_CAUSESEEN) {
458 		fmd_hdl_debug(hdl, "cause already seen -- discarding\n");
459 		goto done;
460 	}
461 #endif
462 
463 	if ((cause = cmd_train_match(trw->trw_mask, xr->xr_clcode)) == 0) {
464 		/*
465 		 * We didn't match in a train, so we're going to process each
466 		 * event individually.
467 		 */
468 		fmd_hdl_debug(hdl, "didn't match in a train\n");
469 		hdlr(hdl, xr, ep);
470 		goto done;
471 	}
472 
473 	fmd_hdl_debug(hdl, "found a match for train.  cause is %llx, "
474 	    "this is %llx\n", cause, xr->xr_clcode);
475 
476 	/*
477 	 * We've got a train match.  If this event is the cause of the train,
478 	 * process it.
479 	 */
480 	if (cause == xr->xr_clcode) {
481 		trw->trw_flags |= CMD_TRW_F_CAUSESEEN;
482 		hdlr(hdl, xr, ep);
483 	}
484 
485 done:
486 	cmd_trw_deref(hdl, trw);
487 }
488 
489 void
cmd_xxc_resolve(fmd_hdl_t * hdl,cmd_xr_t * xr,fmd_event_t * ep)490 cmd_xxc_resolve(fmd_hdl_t *hdl, cmd_xr_t *xr, fmd_event_t *ep)
491 {
492 	cmd_xxcu_resolve(hdl, xr, ep, cmd_xxc_hdlr);
493 }
494 
495 void
cmd_xxu_resolve(fmd_hdl_t * hdl,cmd_xr_t * xr,fmd_event_t * ep)496 cmd_xxu_resolve(fmd_hdl_t *hdl, cmd_xr_t *xr, fmd_event_t *ep)
497 {
498 	cmd_xxcu_resolve(hdl, xr, ep, cmd_xxu_hdlr);
499 }
500 
501 void
cmd_nop_resolve(fmd_hdl_t * hdl,cmd_xr_t * xr,fmd_event_t * ep)502 cmd_nop_resolve(fmd_hdl_t *hdl, cmd_xr_t *xr, fmd_event_t *ep)
503 {
504 	cmd_xxcu_resolve(hdl, xr, ep, cmd_nop_hdlr);
505 }
506 
507 cmd_evdisp_t
cmd_xxcu_initial(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode,uint_t hdlrid)508 cmd_xxcu_initial(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
509     const char *class, cmd_errcl_t clcode, uint_t hdlrid)
510 {
511 	cmd_xxcu_trw_t *trw;
512 	cmd_case_t *cc;
513 	cmd_cpu_t *cpu;
514 	cmd_xr_t *xr;
515 	uint64_t ena;
516 	uint64_t afar;
517 	uint8_t level = clcode & CMD_ERRCL_LEVEL_EXTRACT;
518 	uint8_t	afar_status;
519 	const errdata_t *ed = NULL;
520 	int ref_incremented = 0;
521 
522 	clcode &= CMD_ERRCL_LEVEL_MASK; /* keep level bits out of train masks */
523 
524 	if ((cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class,
525 	    level)) == NULL || cpu->cpu_faulting)
526 		return (CMD_EVD_UNUSED);
527 
528 	cmd_fill_errdata(clcode, cpu, &cc, &ed);
529 
530 	if (cc->cc_cp != NULL && fmd_case_solved(hdl, cc->cc_cp))
531 		return (CMD_EVD_REDUND);
532 
533 	(void) nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena);
534 
535 	if (cmd_afar_valid(hdl, nvl, clcode, &afar) != 0) {
536 		afar_status = AFLT_STAT_INVALID;
537 		afar = NULL;
538 	} else {
539 		afar_status = AFLT_STAT_VALID;
540 	}
541 
542 	fmd_hdl_debug(hdl, "scheduling %s (%llx) for redelivery\n",
543 	    class, clcode);
544 	fmd_hdl_debug(hdl, "looking up ena %llx,afar %llx with\n", ena, afar);
545 
546 	fmd_hdl_debug(hdl, "afar status of %02x\n", afar_status);
547 
548 	if ((trw = cmd_trw_lookup(ena, afar_status, afar)) == NULL) {
549 		if ((trw = cmd_trw_alloc(ena, afar)) == NULL) {
550 			fmd_hdl_debug(hdl, "failed to get new trw\n");
551 			goto redeliver;
552 		}
553 	}
554 
555 	if (trw->trw_flags & CMD_TRW_F_DELETING)
556 		goto redeliver;
557 
558 	if (trw->trw_mask & clcode) {
559 		fmd_hdl_debug(hdl, "clcode %llx is already in trw "
560 		    "(mask %llx)\n", clcode, trw->trw_mask);
561 		return (CMD_EVD_UNUSED);
562 	}
563 
564 	cmd_trw_ref(hdl, trw, clcode);
565 	ref_incremented++;
566 
567 	fmd_hdl_debug(hdl, "trw rescheduled for train delivery\n");
568 
569 redeliver:
570 	if ((xr = cmd_xr_create(hdl, ep, nvl, cpu, clcode)) == NULL) {
571 		fmd_hdl_debug(hdl, "cmd_xr_create failed");
572 		if (ref_incremented)
573 			cmd_trw_deref(hdl, trw);
574 		return (CMD_EVD_BAD);
575 	}
576 
577 	return (cmd_xr_reschedule(hdl, xr, hdlrid));
578 }
579 
580 
581 cmd_evdisp_t
cmd_xxu(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)582 cmd_xxu(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
583     cmd_errcl_t clcode)
584 {
585 	return (cmd_xxcu_initial(hdl, ep, nvl, class, clcode, CMD_XR_HDLR_XXU));
586 }
587 
588 cmd_evdisp_t
cmd_xxc(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)589 cmd_xxc(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
590     cmd_errcl_t clcode)
591 {
592 	return (cmd_xxcu_initial(hdl, ep, nvl, class, clcode, CMD_XR_HDLR_XXC));
593 }
594 
595 cmd_evdisp_t
cmd_nop_train(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)596 cmd_nop_train(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
597     const char *class, cmd_errcl_t clcode)
598 {
599 	return (cmd_xxcu_initial(hdl, ep, nvl, class, clcode, CMD_XR_HDLR_NOP));
600 }
601 
602 cmd_evdisp_t
cmd_miscregs_train(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)603 cmd_miscregs_train(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
604     const char *class, cmd_errcl_t clcode)
605 {
606 	return (cmd_xxcu_initial(hdl, ep, nvl, class, clcode,
607 	    CMD_XR_HDLR_XXC));
608 }
609 
610 void
cmd_cpuerr_close(fmd_hdl_t * hdl,void * arg)611 cmd_cpuerr_close(fmd_hdl_t *hdl, void *arg)
612 {
613 	cmd_cpu_destroy(hdl, arg);
614 }
615