xref: /onnv-gate/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_memerr.c (revision 12467:1f2119e1bc03)
1600Stsien /*
2600Stsien  * CDDL HEADER START
3600Stsien  *
4600Stsien  * The contents of this file are subject to the terms of the
51752Sgavinm  * Common Development and Distribution License (the "License").
61752Sgavinm  * You may not use this file except in compliance with the License.
7600Stsien  *
8600Stsien  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9600Stsien  * or http://www.opensolaris.org/os/licensing.
10600Stsien  * See the License for the specific language governing permissions
11600Stsien  * and limitations under the License.
12600Stsien  *
13600Stsien  * When distributing Covered Code, include this CDDL HEADER in each
14600Stsien  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15600Stsien  * If applicable, add the following below this CDDL HEADER, with the
16600Stsien  * fields enclosed by brackets "[]" replaced with your own identifying
17600Stsien  * information: Portions Copyright [yyyy] [name of copyright owner]
18600Stsien  *
19600Stsien  * CDDL HEADER END
20600Stsien  */
21600Stsien /*
2212144SLouis.Tsien@Sun.COM  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23600Stsien  */
24600Stsien 
25600Stsien /*
26600Stsien  * Ereport-handling routines for memory errors
27600Stsien  */
28600Stsien 
29600Stsien #include <cmd_mem.h>
30600Stsien #include <cmd_dimm.h>
31600Stsien #include <cmd_bank.h>
32600Stsien #include <cmd_page.h>
33600Stsien #include <cmd_cpu.h>
341186Sayznaga #ifdef sun4u
351186Sayznaga #include <cmd_dp.h>
361186Sayznaga #include <cmd_dp_page.h>
371186Sayznaga #endif
38600Stsien #include <cmd.h>
39600Stsien 
40600Stsien #include <strings.h>
41600Stsien #include <string.h>
42600Stsien #include <errno.h>
43*12467STrang.Do@Sun.COM #include <limits.h>
44600Stsien #include <fm/fmd_api.h>
45600Stsien #include <sys/fm/protocol.h>
46600Stsien #include <sys/async.h>
47600Stsien #include <sys/errclassify.h>
488016STom.Pothier@Sun.COM #include <assert.h>
49600Stsien 
506828Stsien #ifdef sun4v
516828Stsien #include <cmd_hc_sun4v.h>
526828Stsien #endif /* sun4v */
536828Stsien 
54600Stsien struct ce_name2type {
55600Stsien 	const char *name;
56600Stsien 	ce_dispact_t type;
57600Stsien };
58600Stsien 
59600Stsien ce_dispact_t
cmd_mem_name2type(const char * name,int minorvers)60600Stsien cmd_mem_name2type(const char *name, int minorvers)
61600Stsien {
62600Stsien 	static const struct ce_name2type old[] = {
63600Stsien 		{ ERR_TYPE_DESC_INTERMITTENT,	CE_DISP_INTERMITTENT },
64600Stsien 		{ ERR_TYPE_DESC_PERSISTENT,	CE_DISP_PERS },
65600Stsien 		{ ERR_TYPE_DESC_STICKY,		CE_DISP_STICKY },
66600Stsien 		{ ERR_TYPE_DESC_UNKNOWN,	CE_DISP_UNKNOWN },
67600Stsien 		{ NULL }
68600Stsien 	};
69600Stsien 	static const struct ce_name2type new[] = {
70600Stsien 		{ CE_DISP_DESC_U,		CE_DISP_UNKNOWN },
71600Stsien 		{ CE_DISP_DESC_I,		CE_DISP_INTERMITTENT },
72600Stsien 		{ CE_DISP_DESC_PP,		CE_DISP_POSS_PERS },
73600Stsien 		{ CE_DISP_DESC_P,		CE_DISP_PERS },
74600Stsien 		{ CE_DISP_DESC_L,		CE_DISP_LEAKY },
75600Stsien 		{ CE_DISP_DESC_PS,		CE_DISP_POSS_STICKY },
76600Stsien 		{ CE_DISP_DESC_S,		CE_DISP_STICKY },
77600Stsien 		{ NULL }
78600Stsien 	};
79600Stsien 	const struct ce_name2type *names = (minorvers == 0) ? &old[0] : &new[0];
80600Stsien 	const struct ce_name2type *tp;
81600Stsien 
82600Stsien 	for (tp = names; tp->name != NULL; tp++)
83600Stsien 		if (strcasecmp(name, tp->name) == 0)
84600Stsien 			return (tp->type);
85600Stsien 
86600Stsien 	return (CE_DISP_UNKNOWN);
87600Stsien }
88600Stsien 
89*12467STrang.Do@Sun.COM /*
90*12467STrang.Do@Sun.COM  * check if a dimm has n CEs with the same symbol-in-error
91*12467STrang.Do@Sun.COM  */
92*12467STrang.Do@Sun.COM static int
upos_thresh_check(cmd_dimm_t * dimm,uint16_t upos,uint32_t threshold)93*12467STrang.Do@Sun.COM upos_thresh_check(cmd_dimm_t *dimm, uint16_t upos, uint32_t threshold)
94*12467STrang.Do@Sun.COM {
95*12467STrang.Do@Sun.COM 	int i;
96*12467STrang.Do@Sun.COM 	cmd_mq_t *ip, *next;
97*12467STrang.Do@Sun.COM 	int count = 0;
98*12467STrang.Do@Sun.COM 
99*12467STrang.Do@Sun.COM 	for (i = 0; i < CMD_MAX_CKWDS; i++) {
100*12467STrang.Do@Sun.COM 		for (ip = cmd_list_next(&dimm->mq_root[i]); ip != NULL;
101*12467STrang.Do@Sun.COM 		    ip = next) {
102*12467STrang.Do@Sun.COM 			next = cmd_list_next(ip);
103*12467STrang.Do@Sun.COM 			if (ip->mq_unit_position == upos) {
104*12467STrang.Do@Sun.COM 				count++;
105*12467STrang.Do@Sun.COM 				if (count >= threshold)
106*12467STrang.Do@Sun.COM 					return (1);
107*12467STrang.Do@Sun.COM 			}
108*12467STrang.Do@Sun.COM 		}
109*12467STrang.Do@Sun.COM 	}
110*12467STrang.Do@Sun.COM 	return (0);
111*12467STrang.Do@Sun.COM }
112*12467STrang.Do@Sun.COM 
113*12467STrang.Do@Sun.COM /*
114*12467STrang.Do@Sun.COM  * check if smaller number of retired pages > 1/16 of larger
115*12467STrang.Do@Sun.COM  * number of retired pages
116*12467STrang.Do@Sun.COM  */
117*12467STrang.Do@Sun.COM static int
check_bad_rw_retired_pages(fmd_hdl_t * hdl,cmd_dimm_t * d1,cmd_dimm_t * d2)118*12467STrang.Do@Sun.COM check_bad_rw_retired_pages(fmd_hdl_t *hdl, cmd_dimm_t *d1, cmd_dimm_t *d2)
119*12467STrang.Do@Sun.COM {
120*12467STrang.Do@Sun.COM 	uint_t sret, lret;
121*12467STrang.Do@Sun.COM 	double ratio;
122*12467STrang.Do@Sun.COM 	uint_t d1_nretired, d2_nretired;
123*12467STrang.Do@Sun.COM 
124*12467STrang.Do@Sun.COM 	sret = lret = 0;
125*12467STrang.Do@Sun.COM 
126*12467STrang.Do@Sun.COM 	d1_nretired = d1->dimm_nretired;
127*12467STrang.Do@Sun.COM 	d2_nretired = d2->dimm_nretired;
128*12467STrang.Do@Sun.COM 
129*12467STrang.Do@Sun.COM 	if (d1->dimm_bank != NULL)
130*12467STrang.Do@Sun.COM 		d1_nretired += d1->dimm_bank->bank_nretired;
131*12467STrang.Do@Sun.COM 
132*12467STrang.Do@Sun.COM 	if (d2->dimm_bank != NULL)
133*12467STrang.Do@Sun.COM 		d2_nretired += d2->dimm_bank->bank_nretired;
134*12467STrang.Do@Sun.COM 
135*12467STrang.Do@Sun.COM 	if (d2_nretired < d1_nretired) {
136*12467STrang.Do@Sun.COM 		sret = d2_nretired;
137*12467STrang.Do@Sun.COM 		lret = d1_nretired;
138*12467STrang.Do@Sun.COM 	} else if (d2_nretired > d1_nretired) {
139*12467STrang.Do@Sun.COM 		sret = d1_nretired;
140*12467STrang.Do@Sun.COM 		lret = d2_nretired;
141*12467STrang.Do@Sun.COM 	} else
142*12467STrang.Do@Sun.COM 		return (0);
143*12467STrang.Do@Sun.COM 
144*12467STrang.Do@Sun.COM 	ratio = lret * CMD_PAGE_RATIO;
145*12467STrang.Do@Sun.COM 
146*12467STrang.Do@Sun.COM 	if (sret > ratio) {
147*12467STrang.Do@Sun.COM 		fmd_hdl_debug(hdl, "sret=%d lret=%d ratio=%.3f\n",
148*12467STrang.Do@Sun.COM 		    sret, lret, ratio);
149*12467STrang.Do@Sun.COM 		return (1);
150*12467STrang.Do@Sun.COM 	}
151*12467STrang.Do@Sun.COM 	return (0);
152*12467STrang.Do@Sun.COM }
153*12467STrang.Do@Sun.COM 
154*12467STrang.Do@Sun.COM /*
155*12467STrang.Do@Sun.COM  * check bad rw between two DIMMs
156*12467STrang.Do@Sun.COM  * the check succeeds if
157*12467STrang.Do@Sun.COM  * - each DIMM has 4 CEs with the same symbol-in-error.
158*12467STrang.Do@Sun.COM  * - the smaller number of retired pages > 1/16 larger number of retired pages
159*12467STrang.Do@Sun.COM  */
160*12467STrang.Do@Sun.COM static int
check_bad_rw_between_dimms(fmd_hdl_t * hdl,cmd_dimm_t * d1,cmd_dimm_t * d2,uint16_t * rupos)161*12467STrang.Do@Sun.COM check_bad_rw_between_dimms(fmd_hdl_t *hdl, cmd_dimm_t *d1, cmd_dimm_t *d2,
162*12467STrang.Do@Sun.COM     uint16_t *rupos)
163*12467STrang.Do@Sun.COM {
164*12467STrang.Do@Sun.COM 	int i;
165*12467STrang.Do@Sun.COM 	cmd_mq_t *ip, *next;
166*12467STrang.Do@Sun.COM 	uint16_t upos;
167*12467STrang.Do@Sun.COM 
168*12467STrang.Do@Sun.COM 	for (i = 0; i < CMD_MAX_CKWDS; i++) {
169*12467STrang.Do@Sun.COM 		for (ip = cmd_list_next(&d1->mq_root[i]); ip != NULL;
170*12467STrang.Do@Sun.COM 		    ip = next) {
171*12467STrang.Do@Sun.COM 			next = cmd_list_next(ip);
172*12467STrang.Do@Sun.COM 			upos = ip->mq_unit_position;
173*12467STrang.Do@Sun.COM 			if (upos_thresh_check(d1, upos, cmd.cmd_nupos)) {
174*12467STrang.Do@Sun.COM 				if (upos_thresh_check(d2, upos,
175*12467STrang.Do@Sun.COM 				    cmd.cmd_nupos)) {
176*12467STrang.Do@Sun.COM 					if (check_bad_rw_retired_pages(hdl,
177*12467STrang.Do@Sun.COM 					    d1, d2)) {
178*12467STrang.Do@Sun.COM 						*rupos = upos;
179*12467STrang.Do@Sun.COM 						return (1);
180*12467STrang.Do@Sun.COM 					}
181*12467STrang.Do@Sun.COM 				}
182*12467STrang.Do@Sun.COM 			}
183*12467STrang.Do@Sun.COM 		}
184*12467STrang.Do@Sun.COM 	}
185*12467STrang.Do@Sun.COM 
186*12467STrang.Do@Sun.COM 	return (0);
187*12467STrang.Do@Sun.COM }
188*12467STrang.Do@Sun.COM 
189*12467STrang.Do@Sun.COM static void
bad_reader_writer_check(fmd_hdl_t * hdl,cmd_dimm_t * ce_dimm,nvlist_t * det)190*12467STrang.Do@Sun.COM bad_reader_writer_check(fmd_hdl_t *hdl, cmd_dimm_t *ce_dimm, nvlist_t *det)
191*12467STrang.Do@Sun.COM {
192*12467STrang.Do@Sun.COM 	cmd_dimm_t *d, *next;
193*12467STrang.Do@Sun.COM 	uint16_t upos;
194*12467STrang.Do@Sun.COM 
195*12467STrang.Do@Sun.COM 	for (d = cmd_list_next(&cmd.cmd_dimms); d != NULL; d = next) {
196*12467STrang.Do@Sun.COM 		next = cmd_list_next(d);
197*12467STrang.Do@Sun.COM 		if (d == ce_dimm)
198*12467STrang.Do@Sun.COM 			continue;
199*12467STrang.Do@Sun.COM 		if (!cmd_same_datapath_dimms(ce_dimm, d))
200*12467STrang.Do@Sun.COM 			continue;
201*12467STrang.Do@Sun.COM 		if (check_bad_rw_between_dimms(hdl, ce_dimm, d, &upos)) {
202*12467STrang.Do@Sun.COM 			cmd_gen_datapath_fault(hdl, ce_dimm, d, upos, det);
203*12467STrang.Do@Sun.COM 			cmd_dimm_save_symbol_error(ce_dimm, upos);
204*12467STrang.Do@Sun.COM 			fmd_hdl_debug(hdl,
205*12467STrang.Do@Sun.COM 			    "check_bad_rw_dimms succeeded: %s %s",
206*12467STrang.Do@Sun.COM 			    ce_dimm->dimm_unum, d->dimm_unum);
207*12467STrang.Do@Sun.COM 			return;
208*12467STrang.Do@Sun.COM 		}
209*12467STrang.Do@Sun.COM 	}
210*12467STrang.Do@Sun.COM }
211*12467STrang.Do@Sun.COM 
212*12467STrang.Do@Sun.COM /*
213*12467STrang.Do@Sun.COM  * rule 5a checking. The check succeeds if
214*12467STrang.Do@Sun.COM  * - nretired >= 512
215*12467STrang.Do@Sun.COM  * - nretired >= 128 and (addr_hi - addr_low) / (nretired - 1) > 512KB
216*12467STrang.Do@Sun.COM  */
217600Stsien static void
ce_thresh_check(fmd_hdl_t * hdl,cmd_dimm_t * dimm)218600Stsien ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
219600Stsien {
220600Stsien 	nvlist_t *flt;
221600Stsien 	fmd_case_t *cp;
222*12467STrang.Do@Sun.COM 	uint_t nret;
223*12467STrang.Do@Sun.COM 	uint64_t delta_addr = 0;
224600Stsien 
225*12467STrang.Do@Sun.COM 	if (dimm->dimm_flags & CMD_MEM_F_FAULTING)
226600Stsien 		/* We've already complained about this DIMM */
227600Stsien 		return;
228600Stsien 
229600Stsien 	nret = dimm->dimm_nretired;
230600Stsien 	if (dimm->dimm_bank != NULL)
231600Stsien 		nret += dimm->dimm_bank->bank_nretired;
232600Stsien 
233*12467STrang.Do@Sun.COM 	if (nret < cmd.cmd_low_ce_thresh)
234*12467STrang.Do@Sun.COM 		return;
235*12467STrang.Do@Sun.COM 
236*12467STrang.Do@Sun.COM 	if (dimm->dimm_phys_addr_hi >= dimm->dimm_phys_addr_low)
237*12467STrang.Do@Sun.COM 		delta_addr =
238*12467STrang.Do@Sun.COM 		    (dimm->dimm_phys_addr_hi - dimm->dimm_phys_addr_low) /
239*12467STrang.Do@Sun.COM 		    (nret - 1);
240*12467STrang.Do@Sun.COM 
241*12467STrang.Do@Sun.COM 	if (nret >= cmd.cmd_hi_ce_thresh || delta_addr > CMD_MQ_512KB) {
242*12467STrang.Do@Sun.COM 
243*12467STrang.Do@Sun.COM 		dimm->dimm_flags |= CMD_MEM_F_FAULTING;
244*12467STrang.Do@Sun.COM 		cmd_dimm_dirty(hdl, dimm);
245*12467STrang.Do@Sun.COM 
246*12467STrang.Do@Sun.COM 		cp = fmd_case_open(hdl, NULL);
247*12467STrang.Do@Sun.COM 		flt = cmd_dimm_create_fault(hdl, dimm,
248*12467STrang.Do@Sun.COM 		    "fault.memory.dimm-page-retires-excessive", CMD_FLTMAXCONF);
249*12467STrang.Do@Sun.COM 		fmd_case_add_suspect(hdl, cp, flt);
250*12467STrang.Do@Sun.COM 		fmd_case_solve(hdl, cp);
251*12467STrang.Do@Sun.COM 		fmd_hdl_debug(hdl, "ce_thresh_check succeeded nretired %d\n",
252*12467STrang.Do@Sun.COM 		    nret);
253*12467STrang.Do@Sun.COM 
254*12467STrang.Do@Sun.COM 	}
255*12467STrang.Do@Sun.COM }
256600Stsien 
257*12467STrang.Do@Sun.COM /*
258*12467STrang.Do@Sun.COM  * rule 5b checking. The check succeeds if
259*12467STrang.Do@Sun.COM  * more than 120 non-intermittent CEs are reported against one symbol
260*12467STrang.Do@Sun.COM  * position of one afar in 72 hours.
261*12467STrang.Do@Sun.COM  */
262*12467STrang.Do@Sun.COM static void
mq_5b_check(fmd_hdl_t * hdl,cmd_dimm_t * dimm)263*12467STrang.Do@Sun.COM mq_5b_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
264*12467STrang.Do@Sun.COM {
265*12467STrang.Do@Sun.COM 	nvlist_t *flt;
266*12467STrang.Do@Sun.COM 	fmd_case_t *cp;
267*12467STrang.Do@Sun.COM 	cmd_mq_t *ip, *next;
268*12467STrang.Do@Sun.COM 	int cw;
269600Stsien 
270*12467STrang.Do@Sun.COM 	for (cw = 0; cw < CMD_MAX_CKWDS; cw++) {
271*12467STrang.Do@Sun.COM 		for (ip = cmd_list_next(&dimm->mq_root[cw]);
272*12467STrang.Do@Sun.COM 		    ip != NULL; ip = next) {
273*12467STrang.Do@Sun.COM 			next = cmd_list_next(ip);
274*12467STrang.Do@Sun.COM 			if (ip->mq_dupce_count >= cmd.cmd_dupce) {
275*12467STrang.Do@Sun.COM 				cp = fmd_case_open(hdl, NULL);
276*12467STrang.Do@Sun.COM 				flt = cmd_dimm_create_fault(hdl, dimm,
277*12467STrang.Do@Sun.COM 				    "fault.memory.dimm-page-retires-excessive",
278*12467STrang.Do@Sun.COM 				    CMD_FLTMAXCONF);
279*12467STrang.Do@Sun.COM 				dimm->dimm_flags |= CMD_MEM_F_FAULTING;
280*12467STrang.Do@Sun.COM 				cmd_dimm_dirty(hdl, dimm);
281*12467STrang.Do@Sun.COM 				fmd_case_add_suspect(hdl, cp, flt);
282*12467STrang.Do@Sun.COM 				fmd_case_solve(hdl, cp);
283*12467STrang.Do@Sun.COM 				fmd_hdl_debug(hdl,
284*12467STrang.Do@Sun.COM 				    "mq_5b_check succeeded: duplicate CE=%d",
285*12467STrang.Do@Sun.COM 				    ip->mq_dupce_count);
286*12467STrang.Do@Sun.COM 				return;
287*12467STrang.Do@Sun.COM 			}
288600Stsien 		}
289600Stsien 	}
290*12467STrang.Do@Sun.COM }
291600Stsien 
292*12467STrang.Do@Sun.COM /*
293*12467STrang.Do@Sun.COM  * delete the expired duplicate CE time stamps
294*12467STrang.Do@Sun.COM  */
295*12467STrang.Do@Sun.COM void
mq_prune_dup(fmd_hdl_t * hdl,cmd_mq_t * ip,uint64_t now)296*12467STrang.Do@Sun.COM mq_prune_dup(fmd_hdl_t *hdl, cmd_mq_t *ip, uint64_t now)
297*12467STrang.Do@Sun.COM {
298*12467STrang.Do@Sun.COM 	tstamp_t *tsp, *next;
299600Stsien 
300*12467STrang.Do@Sun.COM 	for (tsp = cmd_list_next(&ip->mq_dupce_tstamp); tsp != NULL;
301*12467STrang.Do@Sun.COM 	    tsp = next) {
302*12467STrang.Do@Sun.COM 		next = cmd_list_next(tsp);
303*12467STrang.Do@Sun.COM 		if (tsp->tstamp < now - CMD_MQ_TIMELIM) {
304*12467STrang.Do@Sun.COM 			cmd_list_delete(&ip->mq_dupce_tstamp, &tsp->ts_l);
305*12467STrang.Do@Sun.COM 			fmd_hdl_free(hdl, tsp, sizeof (tstamp_t));
306*12467STrang.Do@Sun.COM 			ip->mq_dupce_count--;
307600Stsien 		}
308*12467STrang.Do@Sun.COM 	}
309*12467STrang.Do@Sun.COM }
310600Stsien 
311*12467STrang.Do@Sun.COM void
mq_update(fmd_hdl_t * hdl,fmd_event_t * ep,cmd_mq_t * ip,uint64_t now,uint32_t cpuid)312*12467STrang.Do@Sun.COM mq_update(fmd_hdl_t *hdl, fmd_event_t *ep, cmd_mq_t *ip, uint64_t now,
313*12467STrang.Do@Sun.COM     uint32_t cpuid)
314*12467STrang.Do@Sun.COM {
315*12467STrang.Do@Sun.COM 	tstamp_t *tsp;
316600Stsien 
317*12467STrang.Do@Sun.COM 	ip->mq_tstamp = now;
318*12467STrang.Do@Sun.COM 	ip->mq_cpuid = cpuid;
319*12467STrang.Do@Sun.COM 	ip->mq_ep = ep;
320600Stsien 
321*12467STrang.Do@Sun.COM 	if (fmd_serd_exists(hdl, ip->mq_serdnm))
322*12467STrang.Do@Sun.COM 		fmd_serd_destroy(hdl, ip->mq_serdnm);
323*12467STrang.Do@Sun.COM 	fmd_serd_create(hdl, ip->mq_serdnm, CMD_MQ_SERDN, CMD_MQ_SERDT);
324*12467STrang.Do@Sun.COM 	(void) fmd_serd_record(hdl, ip->mq_serdnm, ep);
325*12467STrang.Do@Sun.COM 
326*12467STrang.Do@Sun.COM 	tsp = fmd_hdl_zalloc(hdl, sizeof (tstamp_t), FMD_SLEEP);
327*12467STrang.Do@Sun.COM 	tsp->tstamp = now;
328*12467STrang.Do@Sun.COM 	cmd_list_append(&ip->mq_dupce_tstamp, tsp);
329*12467STrang.Do@Sun.COM 	ip->mq_dupce_count++;
330600Stsien }
331600Stsien 
3323325Ssd77468 /* Create a fresh index block for MQSC CE correlation. */
3333325Ssd77468 cmd_mq_t *
mq_create(fmd_hdl_t * hdl,fmd_event_t * ep,uint64_t afar,uint16_t upos,uint64_t now,uint32_t cpuid)3343325Ssd77468 mq_create(fmd_hdl_t *hdl, fmd_event_t *ep,
335*12467STrang.Do@Sun.COM     uint64_t afar, uint16_t upos, uint64_t now, uint32_t cpuid)
3363325Ssd77468 {
3373325Ssd77468 	cmd_mq_t *cp;
338*12467STrang.Do@Sun.COM 	tstamp_t *tsp;
3397810STom.Pothier@Sun.COM 	uint16_t ckwd = (afar & 0x30) >> 4;
3407810STom.Pothier@Sun.COM 
3413325Ssd77468 	cp = fmd_hdl_zalloc(hdl, sizeof (cmd_mq_t), FMD_SLEEP);
3423325Ssd77468 	cp->mq_tstamp = now;
3437810STom.Pothier@Sun.COM 	cp->mq_ckwd = ckwd;
3443325Ssd77468 	cp->mq_phys_addr = afar;
3453325Ssd77468 	cp->mq_unit_position = upos;
3463325Ssd77468 	cp->mq_ep = ep;
3477810STom.Pothier@Sun.COM 	cp->mq_serdnm =
3487810STom.Pothier@Sun.COM 	    cmd_mq_serdnm_create(hdl, "mq", afar, ckwd, upos);
3497810STom.Pothier@Sun.COM 
350*12467STrang.Do@Sun.COM 	tsp = fmd_hdl_zalloc(hdl, sizeof (tstamp_t), FMD_SLEEP);
351*12467STrang.Do@Sun.COM 	tsp->tstamp = now;
352*12467STrang.Do@Sun.COM 	cmd_list_append(&cp->mq_dupce_tstamp, tsp);
353*12467STrang.Do@Sun.COM 	cp->mq_dupce_count = 1;
354*12467STrang.Do@Sun.COM 	cp->mq_cpuid = cpuid;
355*12467STrang.Do@Sun.COM 
3567810STom.Pothier@Sun.COM 	/*
3577810STom.Pothier@Sun.COM 	 * Create SERD to keep this event from being removed
3587810STom.Pothier@Sun.COM 	 * by fmd which may not know there is an event pointer
3597810STom.Pothier@Sun.COM 	 * saved here. This SERD is *never* meant to fire.
3607810STom.Pothier@Sun.COM 	 * NOTE: wouldn't need to do this if there were an fmd
3617810STom.Pothier@Sun.COM 	 * api to 'hold' an event.
3627810STom.Pothier@Sun.COM 	 */
3637810STom.Pothier@Sun.COM 	if (fmd_serd_exists(hdl, cp->mq_serdnm)) {
3647810STom.Pothier@Sun.COM 		/* clean up dup */
3657810STom.Pothier@Sun.COM 		fmd_serd_destroy(hdl, cp->mq_serdnm);
3667810STom.Pothier@Sun.COM 	}
3677810STom.Pothier@Sun.COM 	fmd_serd_create(hdl, cp->mq_serdnm, CMD_MQ_SERDN, CMD_MQ_SERDT);
3687810STom.Pothier@Sun.COM 	(void) fmd_serd_record(hdl, cp->mq_serdnm, ep);
3693325Ssd77468 
3703325Ssd77468 	return (cp);
3713325Ssd77468 }
3723325Ssd77468 
3737810STom.Pothier@Sun.COM /* Destroy MQSC tracking block as well as event tracking SERD. */
3747810STom.Pothier@Sun.COM 
3757810STom.Pothier@Sun.COM cmd_mq_t *
mq_destroy(fmd_hdl_t * hdl,cmd_list_t * lp,cmd_mq_t * ip)3767810STom.Pothier@Sun.COM mq_destroy(fmd_hdl_t *hdl, cmd_list_t *lp, cmd_mq_t *ip)
3777810STom.Pothier@Sun.COM {
3787810STom.Pothier@Sun.COM 	cmd_mq_t *jp = cmd_list_next(ip);
379*12467STrang.Do@Sun.COM 	tstamp_t *tsp, *next;
3807810STom.Pothier@Sun.COM 
3817810STom.Pothier@Sun.COM 	if (ip->mq_serdnm != NULL) {
382*12467STrang.Do@Sun.COM 		if (fmd_serd_exists(hdl, ip->mq_serdnm))
3837810STom.Pothier@Sun.COM 			fmd_serd_destroy(hdl, ip->mq_serdnm);
3847810STom.Pothier@Sun.COM 		fmd_hdl_strfree(hdl, ip->mq_serdnm);
3857810STom.Pothier@Sun.COM 		ip->mq_serdnm = NULL;
3867810STom.Pothier@Sun.COM 	}
387*12467STrang.Do@Sun.COM 
388*12467STrang.Do@Sun.COM 	for (tsp = cmd_list_next(&ip->mq_dupce_tstamp); tsp != NULL;
389*12467STrang.Do@Sun.COM 	    tsp = next) {
390*12467STrang.Do@Sun.COM 		next = cmd_list_next(tsp);
391*12467STrang.Do@Sun.COM 		cmd_list_delete(&ip->mq_dupce_tstamp, &tsp->ts_l);
392*12467STrang.Do@Sun.COM 		fmd_hdl_free(hdl, tsp, sizeof (tstamp_t));
393*12467STrang.Do@Sun.COM 	}
394*12467STrang.Do@Sun.COM 
3957810STom.Pothier@Sun.COM 	cmd_list_delete(lp, &ip->mq_l);
3967810STom.Pothier@Sun.COM 	fmd_hdl_free(hdl, ip, sizeof (cmd_mq_t));
3977810STom.Pothier@Sun.COM 
3987810STom.Pothier@Sun.COM 	return (jp);
3997810STom.Pothier@Sun.COM }
4007810STom.Pothier@Sun.COM 
4013325Ssd77468 /*
4023325Ssd77468  * Add an index block for a new CE, sorted
4033325Ssd77468  * a) by ascending unit position
4043325Ssd77468  * b) order of arrival (~= time order)
4053325Ssd77468  */
4063325Ssd77468 
4073325Ssd77468 void
mq_add(fmd_hdl_t * hdl,cmd_dimm_t * dimm,fmd_event_t * ep,uint64_t afar,uint16_t synd,uint64_t now,uint32_t cpuid)4083325Ssd77468 mq_add(fmd_hdl_t *hdl, cmd_dimm_t *dimm, fmd_event_t *ep,
409*12467STrang.Do@Sun.COM     uint64_t afar, uint16_t synd, uint64_t now, uint32_t cpuid)
4103325Ssd77468 {
4113325Ssd77468 	cmd_mq_t *ip, *jp;
4123325Ssd77468 	int cw, unit_position;
4133325Ssd77468 
4143325Ssd77468 	cw = (afar & 0x30) >> 4;		/* 0:3 */
4153325Ssd77468 	if ((unit_position = cmd_synd2upos(synd)) < 0)
4163325Ssd77468 		return;				/* not a CE */
4173325Ssd77468 
4183325Ssd77468 	for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL; ) {
4198016STom.Pothier@Sun.COM 		if (ip->mq_unit_position > unit_position) {
4208016STom.Pothier@Sun.COM 			/* list is in unit position order */
4218016STom.Pothier@Sun.COM 			break;
4228016STom.Pothier@Sun.COM 		} else if (ip->mq_unit_position == unit_position &&
4233325Ssd77468 		    ip->mq_phys_addr == afar) {
4243325Ssd77468 			/*
4253325Ssd77468 			 * Found a duplicate cw, unit_position, and afar.
426*12467STrang.Do@Sun.COM 			 * update the mq_t with the new information
4273325Ssd77468 			 */
428*12467STrang.Do@Sun.COM 			mq_update(hdl, ep, ip, now, cpuid);
429*12467STrang.Do@Sun.COM 			return;
4308016STom.Pothier@Sun.COM 		} else {
4318016STom.Pothier@Sun.COM 			ip = cmd_list_next(ip);
4328016STom.Pothier@Sun.COM 		}
4333325Ssd77468 	}
4348016STom.Pothier@Sun.COM 
435*12467STrang.Do@Sun.COM 	jp = mq_create(hdl, ep, afar, unit_position, now, cpuid);
4363325Ssd77468 	if (ip == NULL)
4373325Ssd77468 		cmd_list_append(&dimm->mq_root[cw], jp);
4383325Ssd77468 	else
4393325Ssd77468 		cmd_list_insert_before(&dimm->mq_root[cw], ip, jp);
4403325Ssd77468 }
4413325Ssd77468 
4423325Ssd77468 /*
4433325Ssd77468  * Prune the MQSC index lists (one for each checkword), by deleting
4443325Ssd77468  * outdated index blocks from each list.
4453325Ssd77468  */
4463325Ssd77468 
4473325Ssd77468 void
mq_prune(fmd_hdl_t * hdl,cmd_dimm_t * dimm,uint64_t now)4483325Ssd77468 mq_prune(fmd_hdl_t *hdl, cmd_dimm_t *dimm, uint64_t now)
4493325Ssd77468 {
4507810STom.Pothier@Sun.COM 	cmd_mq_t *ip;
4513325Ssd77468 	int cw;
4523325Ssd77468 
4533325Ssd77468 	for (cw = 0; cw < CMD_MAX_CKWDS; cw++) {
4543325Ssd77468 		for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL; ) {
4558016STom.Pothier@Sun.COM 			if (ip->mq_tstamp < now - CMD_MQ_TIMELIM) {
4567810STom.Pothier@Sun.COM 				/*
4577810STom.Pothier@Sun.COM 				 * This event has timed out - delete the
4587810STom.Pothier@Sun.COM 				 * mq block as well as serd for the event.
4597810STom.Pothier@Sun.COM 				 */
4607810STom.Pothier@Sun.COM 				ip = mq_destroy(hdl, &dimm->mq_root[cw], ip);
4618016STom.Pothier@Sun.COM 			} else {
4628016STom.Pothier@Sun.COM 				/* tstamp < now - ce_t */
463*12467STrang.Do@Sun.COM 				mq_prune_dup(hdl, ip, now);
4648016STom.Pothier@Sun.COM 				ip = cmd_list_next(ip);
4658016STom.Pothier@Sun.COM 			}
4663325Ssd77468 		} /* per checkword */
4673325Ssd77468 	} /* cw = 0...3 */
4683325Ssd77468 }
4693325Ssd77468 
4703325Ssd77468 /*
4713325Ssd77468  * Check the MQSC index lists (one for each checkword) by making a
47212144SLouis.Tsien@Sun.COM  * complete pass through each list, checking if the criteria for
47312144SLouis.Tsien@Sun.COM  * Rule 4A has been met.  Rule 4A checking is done for each checkword.
4743325Ssd77468  *
4753325Ssd77468  * Rule 4A: fault a DIMM  "whenever Solaris reports two or more CEs from
4763325Ssd77468  * two or more different physical addresses on each of two or more different
4773325Ssd77468  * bit positions from the same DIMM within 72 hours of each other, and all
4783325Ssd77468  * the addresses are in the same relative checkword (that is, the AFARs
4793325Ssd77468  * are all the same modulo 64).  [Note: This means at least 4 CEs; two
4803325Ssd77468  * from one bit position, with unique addresses, and two from another,
4813325Ssd77468  * also with unique addresses, and the lower 6 bits of all the addresses
4823325Ssd77468  * are the same."
4833325Ssd77468  */
4843325Ssd77468 
4853325Ssd77468 void
mq_check(fmd_hdl_t * hdl,cmd_dimm_t * dimm)4863325Ssd77468 mq_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
4873325Ssd77468 {
48812144SLouis.Tsien@Sun.COM 	int upos_pairs, curr_upos, cw, i, j;
4893325Ssd77468 	nvlist_t *flt;
4903325Ssd77468 	typedef struct upos_pair {
4913325Ssd77468 		int upos;
4923325Ssd77468 		cmd_mq_t *mq1;
4933325Ssd77468 		cmd_mq_t *mq2;
4943325Ssd77468 	} upos_pair_t;
4953325Ssd77468 	upos_pair_t upos_array[8]; /* max per cw = 2, * 4 cw's */
4963325Ssd77468 	cmd_mq_t *ip;
4973325Ssd77468 
4988016STom.Pothier@Sun.COM 	/*
4998016STom.Pothier@Sun.COM 	 * Each upos_array[] member represents a pair of CEs for the same
5008016STom.Pothier@Sun.COM 	 * unit position (symbol) which on a sun4u is a bit, and on sun4v
5018016STom.Pothier@Sun.COM 	 * is a (4 bit) nibble.
5028016STom.Pothier@Sun.COM 	 * MQSC rule 4 requires pairs of CEs from the same symbol (same DIMM
5038016STom.Pothier@Sun.COM 	 * for rule 4A, and same DRAM for rule 4B) for a violation - this
5048016STom.Pothier@Sun.COM 	 * is why CE pairs are tracked.
5058016STom.Pothier@Sun.COM 	 */
5063325Ssd77468 	upos_pairs = 0;
5073325Ssd77468 	upos_array[0].mq1 = NULL;
5088016STom.Pothier@Sun.COM 
5098016STom.Pothier@Sun.COM 	/* Loop through all checkwords */
5103325Ssd77468 	for (cw = 0; cw < CMD_MAX_CKWDS; cw++) {
5113325Ssd77468 		i = upos_pairs;
5123325Ssd77468 		curr_upos = -1;
5138016STom.Pothier@Sun.COM 
5148016STom.Pothier@Sun.COM 		/*
5158016STom.Pothier@Sun.COM 		 * mq_root[] is an array of cumulative lists of CEs
5168016STom.Pothier@Sun.COM 		 * indexed by checkword where the list is in unit position
5178016STom.Pothier@Sun.COM 		 * order. Loop through checking for duplicate unit position
5188016STom.Pothier@Sun.COM 		 * entries (filled in at mq_create()).
5198016STom.Pothier@Sun.COM 		 * The upos_array[] is filled in each time a duplicate
5208016STom.Pothier@Sun.COM 		 * unit position is found; the first time through the loop
5218016STom.Pothier@Sun.COM 		 * of a unit position sets curr_upos but does not fill in
5228016STom.Pothier@Sun.COM 		 * upos_array[] until the second symbol is found.
5238016STom.Pothier@Sun.COM 		 */
5243325Ssd77468 		for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL;
5253325Ssd77468 		    ip = cmd_list_next(ip)) {
5268016STom.Pothier@Sun.COM 			if (curr_upos != ip->mq_unit_position) {
5278016STom.Pothier@Sun.COM 				/* Set initial current position */
5283325Ssd77468 				curr_upos = ip->mq_unit_position;
5298016STom.Pothier@Sun.COM 			} else if (i > upos_pairs &&
5308016STom.Pothier@Sun.COM 			    curr_upos == upos_array[i-1].upos) {
5318016STom.Pothier@Sun.COM 				/*
5328016STom.Pothier@Sun.COM 				 * Only keep track of CE pairs; skip
5338016STom.Pothier@Sun.COM 				 * triples, quads, etc...
5348016STom.Pothier@Sun.COM 				 */
5358016STom.Pothier@Sun.COM 				continue;
5368016STom.Pothier@Sun.COM 			} else if (upos_array[i].mq1 == NULL) {
5378016STom.Pothier@Sun.COM 				/*
5388016STom.Pothier@Sun.COM 				 * Have a pair, add to upos_array[].
5398016STom.Pothier@Sun.COM 				 */
5403325Ssd77468 				upos_array[i].upos = curr_upos;
5413325Ssd77468 				upos_array[i].mq1 = cmd_list_prev(ip);
5423325Ssd77468 				upos_array[i].mq2 = ip;
5433325Ssd77468 				upos_array[++i].mq1 = NULL;
5443325Ssd77468 			}
5453325Ssd77468 		}
5468016STom.Pothier@Sun.COM 
5473325Ssd77468 		if (i - upos_pairs >= 2) {
5488016STom.Pothier@Sun.COM 			/* Rule 4A Violation. */
5493325Ssd77468 			flt = cmd_dimm_create_fault(hdl,
5508297STom.Pothier@Sun.COM 			    dimm, "fault.memory.dimm-ue-imminent",
5518297STom.Pothier@Sun.COM 			    CMD_FLTMAXCONF);
5523325Ssd77468 			for (j = upos_pairs; j < i; j++) {
5533325Ssd77468 				fmd_case_add_ereport(hdl,
5543325Ssd77468 				    dimm->dimm_case.cc_cp,
5553325Ssd77468 				    upos_array[j].mq1->mq_ep);
5563325Ssd77468 				fmd_case_add_ereport(hdl,
5573325Ssd77468 				    dimm->dimm_case.cc_cp,
5583325Ssd77468 				    upos_array[j].mq2->mq_ep);
5593325Ssd77468 			}
5604038Stsien 			dimm->dimm_flags |= CMD_MEM_F_FAULTING;
5614038Stsien 			cmd_dimm_dirty(hdl, dimm);
5623325Ssd77468 			fmd_case_add_suspect(hdl, dimm->dimm_case.cc_cp, flt);
5633325Ssd77468 			fmd_case_solve(hdl, dimm->dimm_case.cc_cp);
5643325Ssd77468 			return;
5653325Ssd77468 		}
5664038Stsien 		upos_pairs = i;
5678016STom.Pothier@Sun.COM 		assert(upos_pairs < 8);
5683325Ssd77468 	}
5693325Ssd77468 }
5703325Ssd77468 
571600Stsien /*ARGSUSED*/
572600Stsien cmd_evdisp_t
cmd_ce_common(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,uint64_t afar,uint8_t afar_status,uint16_t synd,uint8_t synd_status,ce_dispact_t type,uint64_t disp,nvlist_t * asru)573600Stsien cmd_ce_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
574600Stsien     const char *class, uint64_t afar, uint8_t afar_status, uint16_t synd,
575600Stsien     uint8_t synd_status, ce_dispact_t type, uint64_t disp, nvlist_t *asru)
576600Stsien {
5774038Stsien 	cmd_dimm_t *dimm;
578600Stsien 	cmd_page_t *page;
579600Stsien 	const char *uuid;
580*12467STrang.Do@Sun.COM 	uint64_t *now;
581*12467STrang.Do@Sun.COM 	uint_t nelem;
582*12467STrang.Do@Sun.COM 	uint32_t cpuid;
583*12467STrang.Do@Sun.COM 	nvlist_t *det;
584*12467STrang.Do@Sun.COM 	uint64_t addr;
585*12467STrang.Do@Sun.COM 	int skip_error = 0;
586600Stsien 
587600Stsien 	if (afar_status != AFLT_STAT_VALID ||
588600Stsien 	    synd_status != AFLT_STAT_VALID)
589600Stsien 		return (CMD_EVD_UNUSED);
590600Stsien 
5913325Ssd77468 	if ((page = cmd_page_lookup(afar)) != NULL &&
5923325Ssd77468 	    page->page_case.cc_cp != NULL &&
5933325Ssd77468 	    fmd_case_solved(hdl, page->page_case.cc_cp))
594600Stsien 		return (CMD_EVD_REDUND);
595600Stsien 
5961186Sayznaga #ifdef sun4u
5971186Sayznaga 	if (cmd_dp_error(hdl) || cmd_dp_fault(hdl, afar)) {
5981186Sayznaga 		CMD_STAT_BUMP(dp_ignored_ce);
5991186Sayznaga 		return (CMD_EVD_UNUSED);
6001186Sayznaga 	}
6011186Sayznaga #endif /* sun4u */
6021186Sayznaga 
603600Stsien 	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
604600Stsien 		CMD_STAT_BUMP(bad_mem_asru);
6058016STom.Pothier@Sun.COM 		return (CMD_EVD_BAD);
606600Stsien 	}
607600Stsien 
608600Stsien 	if ((dimm = cmd_dimm_lookup(hdl, asru)) == NULL &&
609600Stsien 	    (dimm = cmd_dimm_create(hdl, asru)) == NULL)
610600Stsien 		return (CMD_EVD_UNUSED);
611600Stsien 
6123325Ssd77468 	if (dimm->dimm_case.cc_cp == NULL) {
6133325Ssd77468 		dimm->dimm_case.cc_cp = cmd_case_create(hdl,
6143325Ssd77468 		    &dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid);
6153325Ssd77468 	}
6163325Ssd77468 
617*12467STrang.Do@Sun.COM 	if (nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, &det) != 0)
618*12467STrang.Do@Sun.COM 		return (CMD_EVD_BAD);
619*12467STrang.Do@Sun.COM 
6203325Ssd77468 	/*
6213325Ssd77468 	 * Add to MQSC correlation lists all CEs which pass validity
6223325Ssd77468 	 * checks above.
623*12467STrang.Do@Sun.COM 	 * Add mq_t when there is no bad r/w or dimm fault.
624*12467STrang.Do@Sun.COM 	 * Always prune the expired mq_t.
6253325Ssd77468 	 */
626*12467STrang.Do@Sun.COM 	skip_error = cmd_dimm_check_symbol_error(dimm, synd);
627*12467STrang.Do@Sun.COM 
628*12467STrang.Do@Sun.COM 	if (nvlist_lookup_uint64_array(nvl,
629*12467STrang.Do@Sun.COM 	    "__tod", &now, &nelem) == 0) {
630*12467STrang.Do@Sun.COM 
631*12467STrang.Do@Sun.COM 		if (!skip_error || !(dimm->dimm_flags & CMD_MEM_F_FAULTING)) {
632*12467STrang.Do@Sun.COM 			if (nvlist_lookup_uint32(det, FM_FMRI_CPU_ID, &cpuid)
633*12467STrang.Do@Sun.COM 			    != 0)
634*12467STrang.Do@Sun.COM 				cpuid = ULONG_MAX;
6353325Ssd77468 
636*12467STrang.Do@Sun.COM 			mq_add(hdl, dimm, ep, afar, synd, *now, cpuid);
637*12467STrang.Do@Sun.COM 		}
638*12467STrang.Do@Sun.COM 
639*12467STrang.Do@Sun.COM 		mq_prune(hdl, dimm, *now);
640*12467STrang.Do@Sun.COM 
641*12467STrang.Do@Sun.COM 		if (!skip_error)
642*12467STrang.Do@Sun.COM 			bad_reader_writer_check(hdl, dimm, det);
643*12467STrang.Do@Sun.COM 
644*12467STrang.Do@Sun.COM 		if (!(dimm->dimm_flags & CMD_MEM_F_FAULTING)) {
6453325Ssd77468 			mq_check(hdl, dimm);
646*12467STrang.Do@Sun.COM 			mq_5b_check(hdl, dimm);
6473325Ssd77468 		}
6483325Ssd77468 	}
6493325Ssd77468 
650600Stsien 	switch (type) {
651600Stsien 	case CE_DISP_UNKNOWN:
652600Stsien 		CMD_STAT_BUMP(ce_unknown);
6534038Stsien 		return (CMD_EVD_UNUSED);
654600Stsien 	case CE_DISP_INTERMITTENT:
655600Stsien 		CMD_STAT_BUMP(ce_interm);
6564038Stsien 		return (CMD_EVD_UNUSED);
657600Stsien 	case CE_DISP_POSS_PERS:
658600Stsien 		CMD_STAT_BUMP(ce_ppersis);
659600Stsien 		break;
660600Stsien 	case CE_DISP_PERS:
661600Stsien 		CMD_STAT_BUMP(ce_persis);
662600Stsien 		break;
663600Stsien 	case CE_DISP_LEAKY:
664600Stsien 		CMD_STAT_BUMP(ce_leaky);
665600Stsien 		break;
666600Stsien 	case CE_DISP_POSS_STICKY:
667600Stsien 	{
668600Stsien 		uchar_t ptnrinfo = CE_XDIAG_PTNRINFO(disp);
669600Stsien 
670600Stsien 		if (CE_XDIAG_TESTVALID(ptnrinfo)) {
671600Stsien 			int ce1 = CE_XDIAG_CE1SEEN(ptnrinfo);
672600Stsien 			int ce2 = CE_XDIAG_CE2SEEN(ptnrinfo);
673600Stsien 
674600Stsien 			if (ce1 && ce2) {
675600Stsien 				/* Should have been CE_DISP_STICKY */
676600Stsien 				return (CMD_EVD_BAD);
677600Stsien 			} else if (ce1) {
678600Stsien 				/* Partner could see and could fix CE */
679600Stsien 				CMD_STAT_BUMP(ce_psticky_ptnrclrd);
680600Stsien 			} else {
681600Stsien 				/* Partner could not see ce1 (ignore ce2) */
682600Stsien 				CMD_STAT_BUMP(ce_psticky_ptnrnoerr);
683600Stsien 			}
684600Stsien 		} else {
685600Stsien 			CMD_STAT_BUMP(ce_psticky_noptnr);
686600Stsien 		}
687600Stsien 		return (CMD_EVD_UNUSED);
688600Stsien 	}
689600Stsien 	case CE_DISP_STICKY:
690600Stsien 		CMD_STAT_BUMP(ce_sticky);
691600Stsien 		break;
692600Stsien 	default:
693600Stsien 		return (CMD_EVD_BAD);
694600Stsien 	}
695600Stsien 
696*12467STrang.Do@Sun.COM 	if (cmd_dimm_check_symbol_error(dimm, synd))
697*12467STrang.Do@Sun.COM 		return (CMD_EVD_REDUND);
698*12467STrang.Do@Sun.COM 
6993325Ssd77468 	if (page == NULL)
7003325Ssd77468 		page = cmd_page_create(hdl, asru, afar);
7013325Ssd77468 
7023325Ssd77468 	if (page->page_case.cc_cp == NULL) {
7033325Ssd77468 		page->page_case.cc_cp = cmd_case_create(hdl,
7043325Ssd77468 		    &page->page_header, CMD_PTR_PAGE_CASE, &uuid);
705600Stsien 	}
706600Stsien 
707600Stsien 	switch (type) {
708600Stsien 	case CE_DISP_POSS_PERS:
709600Stsien 	case CE_DISP_PERS:
710600Stsien 		fmd_hdl_debug(hdl, "adding %sPersistent event to CE serd "
711600Stsien 		    "engine\n", type == CE_DISP_POSS_PERS ? "Possible-" : "");
712600Stsien 
7133325Ssd77468 		if (page->page_case.cc_serdnm == NULL) {
7143325Ssd77468 			page->page_case.cc_serdnm = cmd_page_serdnm_create(hdl,
7153325Ssd77468 			    "page", page->page_physbase);
716600Stsien 
7173325Ssd77468 			fmd_serd_create(hdl, page->page_case.cc_serdnm,
718600Stsien 			    fmd_prop_get_int32(hdl, "ce_n"),
719600Stsien 			    fmd_prop_get_int64(hdl, "ce_t"));
720600Stsien 		}
721600Stsien 
7223325Ssd77468 		if (fmd_serd_record(hdl, page->page_case.cc_serdnm, ep) ==
723600Stsien 		    FMD_B_FALSE)
724600Stsien 				return (CMD_EVD_OK); /* engine hasn't fired */
725600Stsien 
7263325Ssd77468 		fmd_hdl_debug(hdl, "ce page serd fired\n");
7273325Ssd77468 		fmd_case_add_serd(hdl, page->page_case.cc_cp,
7283325Ssd77468 		    page->page_case.cc_serdnm);
7293325Ssd77468 		fmd_serd_reset(hdl, page->page_case.cc_serdnm);
730600Stsien 		break;	/* to retire */
731600Stsien 
732600Stsien 	case CE_DISP_LEAKY:
733600Stsien 	case CE_DISP_STICKY:
7343325Ssd77468 		fmd_case_add_ereport(hdl, page->page_case.cc_cp, ep);
735600Stsien 		break;	/* to retire */
736600Stsien 	}
737600Stsien 
738*12467STrang.Do@Sun.COM 	if (page->page_flags & CMD_MEM_F_FAULTING ||
739*12467STrang.Do@Sun.COM 	    fmd_nvl_fmri_unusable(hdl, page->page_asru_nvl))
740*12467STrang.Do@Sun.COM 		return (CMD_EVD_OK);
741*12467STrang.Do@Sun.COM 
742*12467STrang.Do@Sun.COM 	/*
743*12467STrang.Do@Sun.COM 	 * convert a unhashed address to hashed address
744*12467STrang.Do@Sun.COM 	 */
745*12467STrang.Do@Sun.COM 	cmd_to_hashed_addr(&addr, afar, class);
746*12467STrang.Do@Sun.COM 
747*12467STrang.Do@Sun.COM 	if (afar > dimm->dimm_phys_addr_hi)
748*12467STrang.Do@Sun.COM 		dimm->dimm_phys_addr_hi = addr;
749*12467STrang.Do@Sun.COM 
750*12467STrang.Do@Sun.COM 	if (afar < dimm->dimm_phys_addr_low)
751*12467STrang.Do@Sun.COM 		dimm->dimm_phys_addr_low = addr;
752*12467STrang.Do@Sun.COM 
753600Stsien 	dimm->dimm_nretired++;
754600Stsien 	dimm->dimm_retstat.fmds_value.ui64++;
755600Stsien 	cmd_dimm_dirty(hdl, dimm);
756600Stsien 
7571186Sayznaga 	cmd_page_fault(hdl, asru, cmd_dimm_fru(dimm), ep, afar);
758600Stsien 	ce_thresh_check(hdl, dimm);
759600Stsien 
760600Stsien 	return (CMD_EVD_OK);
761600Stsien }
762600Stsien 
7631752Sgavinm /*
7641752Sgavinm  * Solve a bank case with suspect "fault.memory.bank".  The caller must
7651752Sgavinm  * have populated bank->bank_case.cc_cp and is also responsible for adding
7661752Sgavinm  * associated ereport(s) to that case.
7671752Sgavinm  */
7681186Sayznaga void
cmd_bank_fault(fmd_hdl_t * hdl,cmd_bank_t * bank)769600Stsien cmd_bank_fault(fmd_hdl_t *hdl, cmd_bank_t *bank)
770600Stsien {
7711752Sgavinm 	fmd_case_t *cp = bank->bank_case.cc_cp;
772600Stsien 	nvlist_t *flt;
773600Stsien 
774600Stsien 	if (bank->bank_flags & CMD_MEM_F_FAULTING)
775600Stsien 		return; /* Only complain once per bank */
776600Stsien 
777600Stsien 	bank->bank_flags |= CMD_MEM_F_FAULTING;
778600Stsien 	cmd_bank_dirty(hdl, bank);
779600Stsien 
7803325Ssd77468 #ifdef	sun4u
781600Stsien 	flt = cmd_bank_create_fault(hdl, bank, "fault.memory.bank",
782600Stsien 	    CMD_FLTMAXCONF);
783600Stsien 	fmd_case_add_suspect(hdl, cp, flt);
7843325Ssd77468 #else /* sun4v */
7853325Ssd77468 	{
7863325Ssd77468 		cmd_bank_memb_t *d;
7873325Ssd77468 
7883325Ssd77468 		/* create separate fault for each dimm in bank */
7893325Ssd77468 
7903325Ssd77468 		for (d = cmd_list_next(&bank->bank_dimms);
7913325Ssd77468 		    d != NULL; d = cmd_list_next(d)) {
7923325Ssd77468 			flt = cmd_dimm_create_fault(hdl, d->bm_dimm,
7933325Ssd77468 			    "fault.memory.bank", CMD_FLTMAXCONF);
7943325Ssd77468 			fmd_case_add_suspect(hdl, cp, flt);
7953325Ssd77468 		}
7963325Ssd77468 	}
7973325Ssd77468 #endif /* sun4u */
798600Stsien 	fmd_case_solve(hdl, cp);
799600Stsien }
800600Stsien 
801600Stsien /*ARGSUSED*/
802600Stsien cmd_evdisp_t
cmd_ue_common(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,uint64_t afar,uint8_t afar_status,uint16_t synd,uint8_t synd_status,ce_dispact_t type,uint64_t disp,nvlist_t * asru)803600Stsien cmd_ue_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
804600Stsien     const char *class, uint64_t afar, uint8_t afar_status, uint16_t synd,
805600Stsien     uint8_t synd_status, ce_dispact_t type, uint64_t disp, nvlist_t *asru)
806600Stsien {
807600Stsien 	cmd_page_t *page;
808600Stsien 	cmd_bank_t *bank;
809600Stsien 	cmd_cpu_t *cpu;
810600Stsien 
8115737Smb91622 #ifdef sun4u
8125737Smb91622 	/*
8135737Smb91622 	 * Note: Currently all sun4u processors using this code share
8145737Smb91622 	 * L2 and L3 cache at CMD_CPU_LEVEL_CORE.
8155737Smb91622 	 */
8165737Smb91622 	cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class,
8175737Smb91622 	    CMD_CPU_LEVEL_CORE);
8185737Smb91622 #else /* sun4v */
8192400Stsien 	cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class,
8202400Stsien 	    CMD_CPU_LEVEL_THREAD);
8215737Smb91622 #endif /* sun4u */
822600Stsien 
823600Stsien 	if (cpu == NULL) {
824600Stsien 		fmd_hdl_debug(hdl, "cmd_ue_common: cpu not found\n");
825600Stsien 		return (CMD_EVD_UNUSED);
826600Stsien 	}
827600Stsien 
828600Stsien 	/*
829600Stsien 	 * The following code applies only to sun4u, because sun4u does
830600Stsien 	 * not poison data in L2 cache resulting from the fetch of a
831600Stsien 	 * memory UE.
832600Stsien 	 */
833600Stsien 
834600Stsien #ifdef sun4u
835600Stsien 	if (afar_status != AFLT_STAT_VALID) {
836600Stsien 		/*
837600Stsien 		 * Had this report's AFAR been valid, it would have
838600Stsien 		 * contributed an address to the UE cache.  We don't
839600Stsien 		 * know what the AFAR would have been, and thus we can't
840600Stsien 		 * add anything to the cache.  If a xxU is caused by
841600Stsien 		 * this UE, we won't be able to detect it, and will thus
842600Stsien 		 * erroneously offline the CPU.  To prevent this
843600Stsien 		 * situation, we need to assume that all xxUs generated
844600Stsien 		 * through the next E$ flush are attributable to the UE.
845600Stsien 		 */
846600Stsien 		cmd_cpu_uec_set_allmatch(hdl, cpu);
847600Stsien 	} else {
848600Stsien 		cmd_cpu_uec_add(hdl, cpu, afar);
849600Stsien 	}
850600Stsien #endif /* sun4u */
851600Stsien 
8522996Skd93003 	if (synd_status != AFLT_STAT_VALID) {
8532996Skd93003 		fmd_hdl_debug(hdl, "cmd_ue_common: syndrome not valid\n");
8542996Skd93003 		return (CMD_EVD_UNUSED);
8552996Skd93003 	}
8562996Skd93003 
8572996Skd93003 	if (cmd_mem_synd_check(hdl, afar, afar_status, synd, synd_status,
8582996Skd93003 	    cpu) == CMD_EVD_UNUSED)
8592996Skd93003 		return (CMD_EVD_UNUSED);
8602996Skd93003 
861600Stsien 	if (afar_status != AFLT_STAT_VALID)
862600Stsien 		return (CMD_EVD_UNUSED);
863600Stsien 
8643325Ssd77468 	if ((page = cmd_page_lookup(afar)) != NULL &&
8653325Ssd77468 	    page->page_case.cc_cp != NULL &&
8663325Ssd77468 	    fmd_case_solved(hdl, page->page_case.cc_cp))
867600Stsien 		return (CMD_EVD_REDUND);
868600Stsien 
869600Stsien 	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
870600Stsien 		CMD_STAT_BUMP(bad_mem_asru);
871600Stsien 		return (NULL);
872600Stsien 	}
873600Stsien 
874600Stsien 	if ((bank = cmd_bank_lookup(hdl, asru)) == NULL &&
875600Stsien 	    (bank = cmd_bank_create(hdl, asru)) == NULL)
876600Stsien 		return (CMD_EVD_UNUSED);
877600Stsien 
8786828Stsien #ifdef sun4v
8796828Stsien 	{
8806828Stsien 		nvlist_t *fmri;
8816828Stsien 		char **snarray;
8826828Stsien 		unsigned int i, n;
8836828Stsien 
8846828Stsien 		/*
8856828Stsien 		 * 1: locate the array of serial numbers inside the bank asru.
8866828Stsien 		 * 2: for each serial #, lookup its mem: FMRI in libtopo
8876828Stsien 		 * 3: ensure that each DIMM's FMRI is on bank's dimmlist
8886828Stsien 		 */
8896828Stsien 
8906828Stsien 		if (nvlist_lookup_string_array(asru,
8916828Stsien 		    FM_FMRI_MEM_SERIAL_ID, &snarray, &n) != 0)
8926828Stsien 			fmd_hdl_abort(hdl, "Cannot locate serial #s for bank");
8936828Stsien 
8946828Stsien 		for (i = 0; i < n; i++) {
8956828Stsien 			fmri = cmd_find_dimm_by_sn(hdl, FM_FMRI_SCHEME_MEM,
8966828Stsien 			    snarray[i]);
8976828Stsien 			/*
8986828Stsien 			 * If dimm structure doesn't already exist for
8996828Stsien 			 * each dimm, create and link to bank.
9006828Stsien 			 */
9016828Stsien 			if (cmd_dimm_lookup(hdl, fmri) == NULL)
9026828Stsien 				(void) cmd_dimm_create(hdl, fmri);
9036828Stsien 			nvlist_free(fmri);
9046828Stsien 		}
9056828Stsien 	}
9066828Stsien #endif /* sun4v */
9076828Stsien 
908600Stsien 	if (bank->bank_case.cc_cp == NULL) {
909600Stsien 		const char *uuid;
910600Stsien 		bank->bank_case.cc_cp = cmd_case_create(hdl, &bank->bank_header,
911600Stsien 		    CMD_PTR_BANK_CASE, &uuid);
912600Stsien 	}
913600Stsien 
9141186Sayznaga #ifdef sun4u
9151186Sayznaga 	if (cmd_dp_error(hdl)) {
9161186Sayznaga 		CMD_STAT_BUMP(dp_deferred_ue);
9171186Sayznaga 		cmd_dp_page_defer(hdl, asru, ep, afar);
9181186Sayznaga 		return (CMD_EVD_OK);
9191186Sayznaga 	} else if (cmd_dp_fault(hdl, afar)) {
9201186Sayznaga 		CMD_STAT_BUMP(dp_ignored_ue);
9211186Sayznaga 		return (CMD_EVD_UNUSED);
9221186Sayznaga 	}
9231186Sayznaga #endif /* sun4u */
9241186Sayznaga 
925600Stsien 	fmd_case_add_ereport(hdl, bank->bank_case.cc_cp, ep);
926600Stsien 
927600Stsien 	bank->bank_nretired++;
928600Stsien 	bank->bank_retstat.fmds_value.ui64++;
929600Stsien 	cmd_bank_dirty(hdl, bank);
930600Stsien 
931600Stsien 	cmd_page_fault(hdl, bank->bank_asru_nvl, cmd_bank_fru(bank), ep, afar);
932600Stsien 	cmd_bank_fault(hdl, bank);
933600Stsien 
934600Stsien 	return (CMD_EVD_OK);
935600Stsien }
936600Stsien 
937600Stsien void
cmd_dimm_close(fmd_hdl_t * hdl,void * arg)938600Stsien cmd_dimm_close(fmd_hdl_t *hdl, void *arg)
939600Stsien {
940600Stsien 	cmd_dimm_destroy(hdl, arg);
941600Stsien }
942600Stsien 
943600Stsien void
cmd_bank_close(fmd_hdl_t * hdl,void * arg)944600Stsien cmd_bank_close(fmd_hdl_t *hdl, void *arg)
945600Stsien {
946600Stsien 	cmd_bank_destroy(hdl, arg);
947600Stsien }
948