xref: /onnv-gate/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_cpu.h (revision 6429:a90bb8316257)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #ifndef _CMD_CPU_H
27 #define	_CMD_CPU_H
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 /*
32  * Each CPU of interest has a cmd_cpu_t structure.  CPUs become of interest when
33  * they are the focus of ereports, or when they detect UEs.  CPUs may be the
34  * target of several different kinds of ereport, each of which is tracked
35  * differently.  cpu_cases lists the types of cases that can be open against a
36  * given CPU.  The life of a CPU is complicated by the fact that xxCs and xxUs
37  * received by the DE may in fact be side-effects of earlier UEs, xxCs, or xxUs.
38  * Causes of side-effects, and actions taken to resolve them, can be found below
39  * and in cmd_memerr.h.
40  *
41  * Data structures:
42  *      ________                                   CMD_PTR_CPU_ICACHE
43  *     /        \       ,--------.                 CMD_PTR_CPU_DCACHE
44  *     |CPU     | <---- |case_ptr| (one or more of CMD_PTR_CPU_PCACHE         )
45  *     |        |       `--------'                 CMD_PTR_CPU_ITLB
46  *     |,-------|       ,-------.                  CMD_PTR_CPU_DTLB
47  *     ||asru   | ----> |fmri_t |                  CMD_PTR_CPU_L2DATA
48  *     |:-------|       :-------:                  CMD_PTR_CPU_L2DATA_UERETRY
49  *     ||fru    | ----> |fmri_t |                  CMD_PTR_CPU_L2TAG
50  *     |`-------|       `-------'                  CMD_PTR_CPU_L3DATA
51  *     |        |       ,---------.                CMD_PTR_CPU_L3DATA_UERETRY
52  *     | uec    | ----> |UE cache |                CMD_PTR_CPU_L3TAG
53  *     \________/       `---------'                CMD_PTR_CPU_FPU
54  *						   CMD_PTR_CPU_IREG
55  *						   CMD_PTR_CPU_FREG
56  *						   CMD_PTR_CPU_MAU
57  *						   CMD_PTR_CPU_L2CTL
58  *
59  *      ________
60  *     /        \       ,--------.
61  *     | xr     | <---- |case_ptr| (CMD_PTR_XR_WAITER)
62  *     |        |       `--------'
63  *     |,-------|       ,-------.
64  *     ||rsrc   | ----> |fmri_t |
65  *     |`-------|       `-------'
66  *     | cpu    | ----> detecting CPU
67  *     \________/
68  *
69  * Data structure	P?  Case- Notes
70  *                          Rel?
71  * ----------------	--- ----- --------------------------------------
72  * cmd_cpu_t		Yes No    Name is derived from CPU ID ("cpu_%d")
73  * cmd_case_ptr_t	Yes Yes   Name is case's UUID
74  * cpu_asru (fmri_t)	Yes No    Name is derived from CPU ID ("cpu_asru_%d")
75  * cpu_fru (fmri_t)	Yes No    Name is derived from CPU ID ("cpu_fru_%d")
76  * cpu_uec		Yes No    Name is derived from CPU ID ("cpu_uec_%d")
77  * cmd_xr_t		Yes Yes   Name is `redelivery'
78  * xr_rsrc (fmri_t)     Yes No    Name is derived from case's UUID ("%s_rsrc")
79  */
80 
81 #include <cmd.h>
82 #include <cmd_state.h>
83 #include <cmd_fmri.h>
84 
85 #ifdef __cplusplus
86 extern "C" {
87 #endif
88 
89 #define	CPU_FRU_FMRI		FM_FMRI_SCHEME_HC":///" \
90     FM_FMRI_LEGACY_HC"="	/* string prefix: scheme ":///" legacy-hc "=" */
91 
92 #define	BK_LFUFAULT_CERT	50	/* presumably an LFU fault certainty (%) -- TODO confirm */
93 
94 typedef struct cmd_cpu cmd_cpu_t;	/* struct cmd_cpu is defined further down */
95 
96 typedef enum cmd_cpu_type {	/* CPU models; stored in the persisted cpup_type */
97 	CPU_ULTRASPARC_III = 1,	/* numbering starts at 1, so 0 names no model */
98 	CPU_ULTRASPARC_IIIplus,
99 	CPU_ULTRASPARC_IIIi,
100 	CPU_ULTRASPARC_IV,
101 	CPU_ULTRASPARC_IVplus,
102 	CPU_ULTRASPARC_IIIiplus,
103 	CPU_ULTRASPARC_T1,
104 	CPU_SPARC64_VI,
105 	CPU_SPARC64_VII,
106 	CPU_ULTRASPARC_T2,
107 	CPU_ULTRASPARC_T2plus
108 } cmd_cpu_type_t;
109 
110 typedef struct cmd_cpu_cases {	/* one case slot per class of CPU error */
111 	cmd_case_t cpuc_icache;		/* All I$ errors (IPE, IDSPE, etc) */
112 	cmd_case_t cpuc_dcache;		/* All D$ errors (DPE, DDSPE, etc) */
113 	cmd_case_t cpuc_pcache;		/* All P$ errors (PDSPE) */
114 	cmd_case_t cpuc_itlb;		/* ITLB errors (ITLBPE) */
115 	cmd_case_t cpuc_dtlb;		/* DTLB errors (DTLBPE) */
116 	cmd_case_t cpuc_l2data;		/* All correctable L2$ data errors */
117 	cmd_case_t cpuc_l2tag;		/* All correctable L2$ tag errors */
118 	cmd_case_t cpuc_l3data;		/* All correctable L3$ data errors */
119 	cmd_case_t cpuc_l3tag;		/* All correctable L3$ tag errors */
120 	cmd_case_t cpuc_fpu;		/* FPU errors */
121 	cmd_case_t cpuc_ireg;		/* Integer reg errors (IRC, IRU) */
122 	cmd_case_t cpuc_freg;		/* Float reg errors (FRC, FRU) */
123 	cmd_case_t cpuc_mau;		/* Modular arith errors (MAU) */
124 	cmd_case_t cpuc_l2ctl;		/* L2$ directory, VUAD parity */
125 	cmd_case_t cpuc_misc_regs;	/* Scratchpad array (SCA) */
126 					/* Tick compare (TC) */
127 					/* Store buffer (SBD) */
128 					/* Trap stack array errors (TSA) */
129 	cmd_case_t cpuc_lfu;		/* Coherency link error (LFU) */
130 #ifdef sun4u
131 	cmd_case_t cpuc_opl_invsfsr;	/* Olympus-C cpu inv-sfsr errors */
132 	cmd_case_t cpuc_oplue_detcpu;	/* Olympus-C cpu det. ue (eid=CPU) */
133 	cmd_case_t cpuc_oplue_detio;	/* Olympus-C io det. ue (eid=CPU) */
134 	cmd_case_t cpuc_opl_mtlb;	/* Olympus-C mtlb errors */
135 	cmd_case_t cpuc_opl_tlbp;	/* Olympus-C tlbp errors */
136 	cmd_case_t cpuc_opl_inv_urg;	/* Olympus-C inv-urg invalid urgent */
137 	cmd_case_t cpuc_opl_cre;	/* Olympus-C cre urgent errors */
138 	cmd_case_t cpuc_opl_tsb_ctx;	/* Olympus-C tsb_ctx urgent errors */
139 	cmd_case_t cpuc_opl_tsbp;	/* Olympus-C tsbp urgent errors */
140 	cmd_case_t cpuc_opl_pstate;	/* Olympus-C pstate urgent errors */
141 	cmd_case_t cpuc_opl_tstate;	/* Olympus-C tstate urgent errors */
142 	cmd_case_t cpuc_opl_iug_f;	/* Olympus-C iug_f urgent errors */
143 	cmd_case_t cpuc_opl_iug_r;	/* Olympus-C iug_r urgent errors */
144 	cmd_case_t cpuc_opl_sdc;	/* Olympus-C sdc urgent errors */
145 	cmd_case_t cpuc_opl_wdt;	/* Olympus-C wdt urgent errors */
146 	cmd_case_t cpuc_opl_dtlb;	/* Olympus-C dtlb urgent errors */
147 	cmd_case_t cpuc_opl_itlb;	/* Olympus-C itlb urgent errors */
148 	cmd_case_t cpuc_opl_core_err;	/* Olympus-C core-err urgent errors */
149 	cmd_case_t cpuc_opl_dae;	/* Olympus-C dae urgent errors */
150 	cmd_case_t cpuc_opl_iae;	/* Olympus-C iae urgent errors */
151 	cmd_case_t cpuc_opl_uge;	/* Olympus-C uge urgent errors */
152 #endif	/* sun4u */
153 } cmd_cpu_cases_t;
154 
155 /*
156  * The UE cache.  We actually have two UE caches - the current one and the old
157  * one.  When it's time to flush the UE cache, we move the current UE cache to
158  * the old position and flush the E$.  Then, we schedule the removal of the old
159  * UE cache.  This allows a) xxUs triggered by the flush to match against the
160  * old cache, while b) still allowing new UEs to be added to the current UE
161  * cache.  UE matches will always search in both caches (if present), but
162  * additions will only end up in the current cache.  We go to all of this
163  * effort because the cost of a missed ereport (discarding due to a false match
164  * in the cache) is much less than that of a missed match.  In the latter case,
165  * the CPU will be erroneously offlined.
166  *
167  * A special case is triggered if we see a UE with a non-valid AFAR.  Without
168  * the AFAR, we aren't able to properly match subsequent xxU's.  As a result,
169  * we need to throw the cache into all-match mode, wherein all subsequent match
170  * attempts will succeed until the UE cache is flushed.
171  */
172 
173 #define	CPU_UEC_F_ALLMATCH	0x1	/* all-match mode: every match succeeds */
174 
175 typedef struct cmd_cpu_uec {	/* one UE cache (current or old) */
176 	uint64_t *uec_cache;		/* The UE cache (uec_nent slots) */
177 	uint_t uec_nent;		/* Number of allocated slots in cache */
178 	uint_t uec_flags;		/* CPU_UEC_F_* */
179 	char uec_bufname[CMD_BUFNMLEN];	/* Name of buffer used for cache */
180 } cmd_cpu_uec_t;
181 
182 extern const char *cmd_cpu_type2name(fmd_hdl_t *, cmd_cpu_type_t);
183 extern void cmd_cpu_uec_add(fmd_hdl_t *, cmd_cpu_t *, uint64_t);	/* additions go to the current cache only */
184 extern int cmd_cpu_uec_match(cmd_cpu_t *, uint64_t);	/* matches search current and old caches */
185 extern void cmd_cpu_uec_clear(fmd_hdl_t *, cmd_cpu_t *);
186 extern void cmd_cpu_uec_set_allmatch(fmd_hdl_t *, cmd_cpu_t *);	/* used after a UE with non-valid AFAR */
187 
188 /*
189  * Certain types of xxC and xxU can trigger other types as side-effects.  These
190  * secondary ereports need to be discarded, as treating them as legitimate
191  * ereports in their own right will cause erroneous diagnosis.  As an example
192  * (see cmd_xxcu_trains for more), an L2$ UCC will usually trigger an L2$ WDC
193  * resulting from the trap handler's flushing of the L2$.  If we treat both as
194  * legitimate, we'll end up adding two ereports to the SERD engine,
195  * significantly cutting the threshold for retiring the CPU.
196  *
197  * Our saving grace is the fact that the side-effect ereports will have the same
198  * ENA as the primary.  As such, we can keep track of groups of ereports by ENA.
199  * These groups, which we'll call trains, can then be matched against a list of
200  * known trains.  The list (an array of cmd_xxcu_train_t structures) has both a
201  * description of the composition of the train and an indication as to which of
202  * the received ereports is the primary.
203  *
204  * The cmd_xxcu_trw_t is used to gather the members of the train.  When the
205  * first member comes in, we allocate a trw, recording the ENA of the ereport,
206  * as well as noting its class in trw_mask.  We then reschedule the delivery of
207  * the ereport for some configurable time in the future, trusting that all
208  * members of the train will have arrived by that time.  Subsequent ereports in
209  * the same train match the recorded ENA, and add themselves to the mask.
210  * When the first ereport is redelivered, trw_mask is used to determine whether
211  * or not a train has been seen.  An exact match is required.  If a match is
212  * made, the ereport indicated as the primary cause is used for diagnosis.
213  */
214 
215 #define	CMD_TRW_F_DELETING	0x1	/* reclaiming events */
216 #define	CMD_TRW_F_CAUSESEEN	0x2	/* cause of train already processed */
217 #define	CMD_TRW_F_GCSEEN	0x4	/* seen by GC, erased next time */
218 
219 typedef struct cmd_xxcu_trw {	/* train watcher: gathers ereports by ENA */
220 	uint64_t trw_ena;	/* the ENA for this group of ereports */
221 	uint64_t trw_afar;	/* the AFAR for this group of ereports */
222 	cmd_errcl_t trw_mask;	/* classes of ereports seen with this ENA */
223 	uint16_t trw_cpuid;	/* CPU to which this watcher belongs */
224 	uint8_t	 trw_ref;	/* number of ereports with this ENA */
225 	uint8_t	 trw_flags;	/* CMD_TRW_F_* */
226 	uint32_t trw_pad;	/* presumably explicit padding -- confirm */
227 } cmd_xxcu_trw_t;
228 
229 extern cmd_xxcu_trw_t *cmd_trw_lookup(uint64_t, uint8_t, uint64_t);	/* NOTE(review): declared again near the end of this header */
230 extern cmd_xxcu_trw_t *cmd_trw_alloc(uint64_t, uint64_t);
231 extern void cmd_trw_restore(fmd_hdl_t *);
232 extern void cmd_trw_write(fmd_hdl_t *);
233 extern void cmd_trw_ref(fmd_hdl_t *, cmd_xxcu_trw_t *, cmd_errcl_t);	/* presumably notes the class in trw_mask -- confirm */
234 extern void cmd_trw_deref(fmd_hdl_t *, cmd_xxcu_trw_t *);
235 
236 extern cmd_errcl_t cmd_xxcu_train_match(cmd_errcl_t);	/* presumably maps a trw_mask to the primary's class -- confirm */
237 
238 /*
239  * We don't have access to ereport nvlists when they are redelivered via timer.
240  * As such, we have to retrieve everything we might need for diagnosis when we
241  * first receive the ereport.  The retrieved information is stored in the
242  * cmd_xr_t, which is persisted.
243  */
244 
245 typedef struct cmd_xr cmd_xr_t;	/* struct cmd_xr is defined below */
246 
247 /*
248  * xr_hdlr can't be persisted, so we use these in xr_hdlrid to indicate the
249  * handler to be used.  xr_hdlr is then updated so it can be used directly.
250  */
251 #define	CMD_XR_HDLR_XXC		1	/* presumably selects cmd_xxc_resolve */
252 #define	CMD_XR_HDLR_XXU		2	/* presumably selects cmd_xxu_resolve */
253 #define	CMD_XR_HDLR_NOP		3	/* presumably selects cmd_nop_resolve */
254 
255 typedef void cmd_xr_hdlr_f(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);	/* type of xr_hdlr */
256 
257 /*
258  * For sun4v, the size of xr_synd is expanded to 32 bits in order to
259  * accommodate the Niagara L2 syndrome (4x7 bits).
260  */
261 
262 struct cmd_xr {		/* ereport data saved for redelivery via timer */
263 	cmd_list_t xr_list;	/* presumably list linkage -- confirm */
264 	id_t xr_id;		/* ID of timer used for redelivery */
265 	cmd_cpu_t *xr_cpu;	/* Detecting CPU, recalc'd from cpuid */
266 	uint32_t xr_cpuid;	/* ID of detecting CPU */
267 	uint64_t xr_ena;	/* ENA from ereport */
268 	uint64_t xr_afar;	/* AFAR from ereport nvlist */
269 #ifdef sun4u
270 	uint16_t xr_synd;	/* syndrome from ereport nvlist */
271 #else /* sun4u */
272 	uint32_t xr_synd;	/* for Niagara, enlarged to 32 bits */
273 #endif /* sun4u */
274 	uint8_t xr_afar_status;	/* AFAR status from ereport nvlist */
275 	uint8_t xr_synd_status;	/* syndrome status from ereport nvlist */
276 	cmd_fmri_t xr_rsrc;	/* resource from ereport nvlist */
277 	cmd_errcl_t xr_clcode;	/* CMD_ERRCL_* for this ereport */
278 	cmd_xr_hdlr_f *xr_hdlr;	/* handler, recalc'd from hdlrid on restart */
279 	uint_t xr_hdlrid;	/* CMD_XR_HDLR_*, used for recalc of hdlr */
280 	fmd_case_t *xr_case;	/* Throwaway case used to track redelivery */
281 	uint_t xr_ref;		/* Number of references to this struct */
282 #ifdef sun4u
283 	uint64_t xr_afsr;	/* AFSR from ereport nvlist */
284 	uint8_t  xr_num_ways;   /* Number of Cache ways reporting from nvlist */
285 	uint32_t xr_error_way;  /* The way from the ereport nvlist payload */
286 	uint64_t xr_error_tag;  /* The tag from the ereport nvlist payload */
287 	uint32_t xr_error_index; /* the index from the ereport payload */
288 	uint64_t *xr_cache_data; /* The cache data */
289 	nvlist_t *xr_detector_nvlist; /* The detecting resource */
290 #endif	/* sun4u */
291 };
292 
293 #define	xr_rsrc_nvl		xr_rsrc.fmri_nvl	/* shorthand for xr_rsrc's fmri_nvl member */
294 
295 extern cmd_xr_t *cmd_xr_create(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
296     cmd_cpu_t *, cmd_errcl_t);
297 extern cmd_evdisp_t cmd_xr_reschedule(fmd_hdl_t *, cmd_xr_t *, uint_t);
298 extern void cmd_xr_deref(fmd_hdl_t *, cmd_xr_t *);	/* presumably drops one xr_ref reference -- confirm */
299 extern void cmd_xr_write(fmd_hdl_t *, cmd_xr_t *);
300 
301 extern void cmd_xxc_resolve(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);	/* these three match cmd_xr_hdlr_f */
302 extern void cmd_xxu_resolve(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);
303 extern void cmd_nop_resolve(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);
304 extern cmd_evdisp_t cmd_xxcu_initial(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
305     const char *, cmd_errcl_t,  uint_t);	/* uint_t is presumably a CMD_XR_HDLR_* id -- confirm */
306 
307 /*
308  * The master structure containing or referencing all of the state for a given
309  * CPU.
310  */
311 
312 /*
313  * We periodically flush the E$, thus allowing us to flush the UE cache (see
314  * above for a description of the UE cache).  In particular, we flush it
315  * whenever we see a UE with a non-valid AFAR.  To keep from overflushing the
316  * CPU, we cap the number of flushes that we'll do in response to UEs with
317  * non-valid AFARs.  The cap is the number of permitted flushes per GC/restart
318  * cycle, and was determined arbitrarily.
319  */
320 #define	CPU_UEC_FLUSH_MAX	3	/* permitted flushes per GC/restart cycle */
321 
322 /*
323  * The CPU structure started life without a version number.  Making things more
324  * complicated, the version number in the new struct occupies the space used for
325  * cpu_cpuid in the non-versioned struct.  We therefore have to use somewhat
326  * unorthodox version numbers to distinguish between the two types of struct
327  * (pre- and post-versioning) -- version numbers that can't be mistaken for
328  * CPUIDs.  Our version numbers, therefore, will be negative.
329  *
330  * For future expansion, the version member must always stay where it is.  At
331  * some point in the future, when more structs get versions, the version member
332  * should move into the cmd_header_t.
333  */
334 #define	CPU_MKVERSION(version)	((uint_t)(0 - (version)))	/* negated, stored as uint_t */
335 
336 #define	CMD_CPU_VERSION_1	CPU_MKVERSION(1)	/* -1 */
337 #define	CMD_CPU_VERSION_2	CPU_MKVERSION(2)	/* -2 */
338 #define	CMD_CPU_VERSION_3	CPU_MKVERSION(3)	/* -3 */
339 #define	CMD_CPU_VERSION		CMD_CPU_VERSION_3	/* current version */
340 
341 #define	CMD_CPU_VERSIONED(cpu)	((int)(cpu)->cpu_version < 0)	/* negative as int => versioned struct */
342 
343 #define	CMD_CPU_F_DELETING	0x1	/* presumably a cpu_flags bit -- confirm */
344 
345 typedef struct cmd_cpu_0 {	/* pre-versioning layout: no version member */
346 	cmd_header_t cpu0_header;	/* Nodetype must be CMD_NT_CPU */
347 	uint32_t cpu0_cpuid;		/* Logical ID for this CPU */
348 	cmd_cpu_type_t cpu0_type;	/* CPU model */
349 	fmd_case_t *cpu0_cases[4];	/* v0 had embedded case_t w/4 cases */
350 	uint8_t cpu0_faulting;		/* Set if fault has been issued */
351 	cmd_fmri_t cpu0_asru;		/* ASRU for this CPU */
352 	cmd_fmri_t cpu0_fru;		/* FRU for this CPU */
353 	cmd_cpu_uec_t cpu0_uec;		/* UE cache */
354 	cmd_cpu_uec_t cpu0_olduec;	/* To-be-flushed UE cache */
355 	id_t cpu0_uec_flush;		/* Timer ID for UE cache flush */
356 	uint_t cpu0_uec_nflushes;	/* # of flushes since last restart/GC */
357 	cmd_list_t cpu0_xxu_retries;	/* List of pending xxU retries */
358 } cmd_cpu_0_t;
359 
360 typedef struct cmd_cpu_1 {	/* versioned layout with a case-array pointer */
361 	cmd_header_t cpu1_header;	/* Nodetype must be CMD_NT_CPU */
362 	uint_t cpu1_version;		/* struct version - must follow hdr */
363 	uint32_t cpu1_cpuid;		/* Logical ID for this CPU */
364 	cmd_cpu_type_t cpu1_type;	/* CPU model */
365 	uintptr_t *cpu1_cases;		/* v1 had a pointer to a case array */
366 	uint8_t cpu1_faulting;		/* Set if fault has been issued */
367 	cmd_fmri_t cpu1_asru;		/* ASRU for this CPU */
368 	cmd_fmri_t cpu1_fru;		/* FRU for this CPU */
369 	cmd_cpu_uec_t cpu1_uec;		/* UE cache */
370 	cmd_cpu_uec_t cpu1_olduec;	/* To-be-flushed UE cache */
371 	id_t cpu1_uec_flush;		/* Timer ID for UE cache flush */
372 	uint_t cpu1_uec_nflushes;	/* # of flushes since last restart/GC */
373 	cmd_list_t cpu1_xxu_retries;	/* List of pending xxU retries */
374 } cmd_cpu_1_t;
375 
376 typedef struct cmd_cpu_2 {	/* versioned layout; no case or timer members */
377 	cmd_header_t cpu2_header;	/* Nodetype must be CMD_NT_CPU */
378 	uint_t cpu2_version;		/* struct version - must follow hdr */
379 	uint32_t cpu2_cpuid;		/* Logical ID for this CPU */
380 	cmd_cpu_type_t cpu2_type;	/* CPU model */
381 	uint8_t cpu2_faulting;		/* Set if fault has been issued */
382 	cmd_fmri_t cpu2_asru;		/* ASRU for this CPU */
383 	cmd_fmri_t cpu2_fru;		/* FRU for this CPU */
384 	cmd_cpu_uec_t cpu2_uec;		/* UE cache */
385 	cmd_cpu_uec_t cpu2_olduec;	/* To-be-flushed UE cache */
386 } cmd_cpu_2_t;
387 
388 /* Persisted portion of the cpu structure; embedded first in struct cmd_cpu */
389 typedef struct cmd_cpu_pers {
390 	cmd_header_t cpup_header;	/* Nodetype must be CMD_NT_CPU */
391 	uint_t cpup_version;		/* struct version - must follow hdr */
392 	uint32_t cpup_cpuid;		/* Logical ID for this CPU */
393 	cmd_cpu_type_t cpup_type;	/* CPU model */
394 	uint8_t cpup_faulting;		/* Set if fault has been issued */
395 	uint8_t cpup_level;		/* cpu group level - 0 == thread */
396 	cmd_fmri_t cpup_asru;		/* ASRU for this CPU */
397 	cmd_fmri_t cpup_fru;		/* FRU for this CPU */
398 	cmd_cpu_uec_t cpup_uec;		/* UE cache */
399 	cmd_cpu_uec_t cpup_olduec;	/* To-be-flushed UE cache */
400 } cmd_cpu_pers_t;
401 
402 /* Persistent and dynamic CPU data; members are reached via the cpu_* macros */
403 struct cmd_cpu {
404 	cmd_cpu_pers_t cpu_pers;	/* Persisted portion */
405 	cmd_cpu_cases_t cpu_cases;	/* Cases, one per class of error */
406 	id_t cpu_uec_flush;		/* Timer ID for UE cache flush */
407 	uint_t cpu_uec_nflushes;	/* # of flushes since last restart/GC */
408 	cmd_list_t cpu_xxu_retries;	/* List of pending xxU retries */
409 	uint_t cpu_flags;		/* presumably CMD_CPU_F_* -- confirm */
410 	cmd_list_t cpu_Lxcaches;	/* List of Lxcache state structures */
411 	fmd_stat_t Lxcache_creat;	/* num of Lxcache states created */
412 };
413 
414 #define	CMD_CPU_MAXSIZE	/* largest of the four persisted layouts */ \
415 	MAX(MAX(sizeof (cmd_cpu_0_t), sizeof (cmd_cpu_1_t)), \
416 	    MAX(sizeof (cmd_cpu_2_t), sizeof (cmd_cpu_pers_t)))
417 #define	CMD_CPU_MINSIZE	/* smallest of the four persisted layouts */ \
418 	MIN(MIN(sizeof (cmd_cpu_0_t), sizeof (cmd_cpu_1_t)), \
419 	    MIN(sizeof (cmd_cpu_2_t), sizeof (cmd_cpu_pers_t)))
420 
421 #define	cpu_header		cpu_pers.cpup_header	/* shorthands for cmd_cpu_t's persisted members */
422 #define	cpu_nodetype		cpu_pers.cpup_header.hdr_nodetype
423 #define	cpu_bufname		cpu_pers.cpup_header.hdr_bufname
424 #define	cpu_version		cpu_pers.cpup_version
425 #define	cpu_cpuid		cpu_pers.cpup_cpuid
426 #define	cpu_type		cpu_pers.cpup_type
427 #define	cpu_faulting		cpu_pers.cpup_faulting
428 #define	cpu_level		cpu_pers.cpup_level
429 #define	cpu_asru		cpu_pers.cpup_asru
430 #define	cpu_fru			cpu_pers.cpup_fru
431 #define	cpu_uec			cpu_pers.cpup_uec
432 #define	cpu_olduec		cpu_pers.cpup_olduec
433 #define	cpu_icache		cpu_cases.cpuc_icache	/* shorthands for cmd_cpu_t's case members */
434 #define	cpu_dcache		cpu_cases.cpuc_dcache
435 #define	cpu_pcache		cpu_cases.cpuc_pcache
436 #define	cpu_itlb		cpu_cases.cpuc_itlb
437 #define	cpu_dtlb		cpu_cases.cpuc_dtlb
438 #define	cpu_l2data		cpu_cases.cpuc_l2data
439 #define	cpu_l2tag		cpu_cases.cpuc_l2tag
440 #define	cpu_l3data		cpu_cases.cpuc_l3data
441 #define	cpu_l3tag		cpu_cases.cpuc_l3tag
442 #define	cpu_fpu			cpu_cases.cpuc_fpu
443 #define	cpu_ireg 		cpu_cases.cpuc_ireg
444 #define	cpu_freg		cpu_cases.cpuc_freg
445 #define	cpu_mau			cpu_cases.cpuc_mau
446 #define	cpu_l2ctl		cpu_cases.cpuc_l2ctl
447 #define	cpu_misc_regs		cpu_cases.cpuc_misc_regs
448 #define	cpu_lfu			cpu_cases.cpuc_lfu
449 #ifdef sun4u	/* these members exist in cmd_cpu_cases_t only under sun4u */
450 #define	cpu_opl_invsfsr		cpu_cases.cpuc_opl_invsfsr
451 #define	cpu_oplue_detcpu	cpu_cases.cpuc_oplue_detcpu
452 #define	cpu_oplue_detio		cpu_cases.cpuc_oplue_detio
453 #define	cpu_opl_mtlb		cpu_cases.cpuc_opl_mtlb
454 #define	cpu_opl_tlbp		cpu_cases.cpuc_opl_tlbp
455 #define	cpu_opl_inv_urg		cpu_cases.cpuc_opl_inv_urg
456 #define	cpu_opl_cre		cpu_cases.cpuc_opl_cre
457 #define	cpu_opl_tsb_ctx		cpu_cases.cpuc_opl_tsb_ctx
458 #define	cpu_opl_tsbp		cpu_cases.cpuc_opl_tsbp
459 #define	cpu_opl_pstate		cpu_cases.cpuc_opl_pstate
460 #define	cpu_opl_tstate		cpu_cases.cpuc_opl_tstate
461 #define	cpu_opl_iug_f		cpu_cases.cpuc_opl_iug_f
462 #define	cpu_opl_iug_r		cpu_cases.cpuc_opl_iug_r
463 #define	cpu_opl_sdc		cpu_cases.cpuc_opl_sdc
464 #define	cpu_opl_wdt		cpu_cases.cpuc_opl_wdt
465 #define	cpu_opl_dtlb		cpu_cases.cpuc_opl_dtlb
466 #define	cpu_opl_itlb		cpu_cases.cpuc_opl_itlb
467 #define	cpu_opl_core_err	cpu_cases.cpuc_opl_core_err
468 #define	cpu_opl_dae		cpu_cases.cpuc_opl_dae
469 #define	cpu_opl_iae		cpu_cases.cpuc_opl_iae
470 #define	cpu_opl_uge		cpu_cases.cpuc_opl_uge
471 #endif	/* sun4u */
472 
473 #define	cpu_asru_nvl		cpu_asru.fmri_nvl	/* fmri_nvl member of the ASRU FMRI */
474 #define	cpu_fru_nvl		cpu_fru.fmri_nvl	/* fmri_nvl member of the FRU FMRI */
475 
476 /*
477  * L2$ and L3$ Data errors
478  *
479  *          SERD name
480  *   Type   (if any)   Fault
481  *  ------ ----------- -------------------------------
482  *   xxC   l2cachedata fault.cpu.<cputype>.l2cachedata
483  *   xxU        -      fault.cpu.<cputype>.l2cachedata
484  *  L3_xxC l3cachedata fault.cpu.<cputype>.l3cachedata
485  *  L3_xxU      -      fault.cpu.<cputype>.l3cachedata
486  *
487  * NOTE: For the purposes of the discussion below, xxC and xxU refer to both
488  *       L2$ and L3$ data errors.
489  *
490  * These ereports will be dropped if (among other things) they are side-effects
491  * of UEs (xxUs only) or other xxCs or xxUs.  Whenever UEs are detected, they
492  * are added to a per-CPU cache.  xxUs are then compared to this cache.  If a
493  * xxU's AFAR refers to an address which recently saw a UE, the xxU is dropped,
494  * as it was most likely caused by the UE.  When multiple xxCs and xxUs are seen
495  * with the same ENA, all save one are generally side-effects.  We track these
496  * groups (referred to as trains), matching them against a premade list.  If one
497  * of the trains matches, we drop all but the primary, which is indicated in the
498  * list.
499  *
500  * The expected resolution of l2cachedata and l3cachedata faults is the
501  * disabling of the indicated CPU.
502  */
503 extern cmd_evdisp_t cmd_xxc(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
504     const char *, cmd_errcl_t);	/* presumably the xxC and L3_xxC rows above */
505 extern cmd_evdisp_t cmd_xxu(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
506     const char *, cmd_errcl_t);	/* presumably the xxU and L3_xxU rows above */
507 
508 /*
509  * As of Niagara-2, we ignore writeback (ldwc, ldwu) errors.  Since these were
510  * the only defined follow-on errors for sun4v trains, sun4v L2 cache data
511  * errors no longer need to use the train mechanism.
512  */
513 
514 extern cmd_evdisp_t cmd_l2c(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
515     const char *, cmd_errcl_t);	/* presumably sun4v L2$ data, no trains -- confirm */
516 extern cmd_evdisp_t cmd_l2u(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
517     const char *, cmd_errcl_t);	/* presumably sun4v L2$ data, no trains -- confirm */
518 
519 /*
520  * Common Errdata structure for SERD engines
521  */
522 typedef struct errdata {
523 	cmd_serd_t *ed_serd;		/* SERD engine description */
524 	const char *ed_fltnm;		/* presumably the fault name -- confirm */
525 	const cmd_ptrsubtype_t ed_pst;	/* presumably a CMD_PTR_CPU_* subtype */
526 } errdata_t;
527 
528 /*
529  * L2$ and L3$ Tag errors
530  *
531  *           SERD name
532  *   Type    (if any)   Fault
533  *  ------- ----------- -------------------------------
534  *   TxCE   l2cachetag  fault.cpu.<cputype>.l2cachetag
535  *  L3_THCE l3cachetag  fault.cpu.<cputype>.l3cachetag
536  *    LTC   l2cachetag	fault.cpu.<cputype>.l2cachetag
537  *
538  * We'll never see the uncorrectable Tag errors - they'll cause the machine to
539  * reset, and we'll be ne'er the wiser.
540  *
541  * The expected resolution of l2cachetag and l3cachetag faults is the disabling
542  * of the indicated CPU.
543  */
544 extern cmd_evdisp_t cmd_txce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
545     const char *, cmd_errcl_t);	/* presumably the TxCE row above */
546 
547 extern cmd_evdisp_t cmd_l3_thce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
548     const char *, cmd_errcl_t);	/* presumably the L3_THCE row above */
549 
550 /*
551  * L1$ errors
552  *
553  *          SERD name
554  *   Type   (if any)   Fault
555  *  ------- --------- -------------------------------
556  *   IPE     icache   fault.cpu.<cputype>.icache
557  *   IxSPE   icache   fault.cpu.<cputype>.icache
558  *   DPE     dcache   fault.cpu.<cputype>.dcache
559  *   DxSPE   dcache   fault.cpu.<cputype>.dcache
560  *   PDSPE   pcache   fault.cpu.<cputype>.pcache
561  *
562  * The I$, D$, and P$ are clean, and thus have no uncorrectable errors.
563  *
564  * The expected resolution of icache, dcache, and pcache faults is the disabling
565  * of the indicated CPU.
566  */
567 extern cmd_evdisp_t cmd_icache(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
568     const char *, cmd_errcl_t);	/* presumably IPE and IxSPE */
569 extern cmd_evdisp_t cmd_dcache(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
570     const char *, cmd_errcl_t);	/* presumably DPE and DxSPE */
571 extern cmd_evdisp_t cmd_pcache(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
572     const char *, cmd_errcl_t);	/* presumably PDSPE */
573 
574 /*
575  * TLB errors
576  *
577  *         SERD name
578  *   Type  (if any)   Fault
579  *  ------ --------- -------------------------------
580  *  ITLBPE   itlb    fault.cpu.<cputype>.itlb
581  *  DTLBPE   dtlb    fault.cpu.<cputype>.dtlb
582  *
583  * The expected resolution of itlb and dtlb faults is the disabling of the
584  * indicated CPU.
585  */
586 extern cmd_evdisp_t cmd_itlb(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
587     const char *, cmd_errcl_t);	/* presumably ITLBPE */
588 extern cmd_evdisp_t cmd_dtlb(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
589     const char *, cmd_errcl_t);	/* presumably DTLBPE */
590 
591 extern void cmd_cpuerr_close(fmd_hdl_t *, void *);	/* NOTE(review): meaning of the void * is not shown here */
592 
593 /*
594  * FPU errors
595  *
596  *         SERD name
597  *   Type  (if any)   Fault
598  *  ------ --------- -------------------------------
599  *   FPU       -     fault.cpu.<cputype>.fpu
600  *
601  * The expected resolution of FPU faults is the disabling of the indicated CPU.
602  */
603 extern cmd_evdisp_t cmd_fpu(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
604     const char *, cmd_errcl_t);	/* presumably the FPU row above */
605 
606 
607 
608 /*
609  * FPU (FP-Scrubber) errors
610  *
611  *         SERD name
612  *   Type  (if any)   Fault
613  *  ------ --------- -------------------------------
614  *   FPU       -     fault.cpu.<cputype>.fpu
615  *
616  * The expected resolution of FPU faults is the disabling of the CPU
617  * indicated in the resource FMRI.
618  */
619 extern cmd_evdisp_t cmd_fps(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
620     const char *, cmd_errcl_t);	/* presumably FP-Scrubber ereports */
621 
622 
623 
624 
625 
626 /*
627  * ireg errors
628  *
629  *         SERD name
630  *   Type  (if any)   Fault
631  *  ------ --------- -------------------------------
632  *   IRC     ireg    fault.cpu.<cputype>.ireg
633  *   IRU      -				 "
634  *
635  * The expected resolution of ireg faults is the disabling of the indicated CPU.
636  */
637 extern cmd_evdisp_t cmd_irc(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
638     const char *, cmd_errcl_t);	/* presumably IRC (uses the ireg SERD) */
639 extern cmd_evdisp_t cmd_iru(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
640     const char *, cmd_errcl_t);	/* presumably IRU (no SERD) */
641 
642 /*
643  * freg errors
644  *
645  *         SERD name
646  *   Type  (if any)   Fault
647  *  ------ --------- -------------------------------
648  *   FRC     freg    fault.cpu.ultraSPARC-T1.frc
649  *   FRU      -                           " .fru
650  *
651  * The expected resolution of freg faults is the repair of the indicated CPU.
652  */
653 extern cmd_evdisp_t cmd_frc(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
654     const char *, cmd_errcl_t);	/* presumably FRC (uses the freg SERD) */
655 extern cmd_evdisp_t cmd_fru(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
656     const char *, cmd_errcl_t);	/* presumably FRU the ereport type, not a FRU FMRI */
657 
658 /*
659  * MAU errors
660  *
661  *         SERD name
662  *   Type  (if any)   Fault
663  *  ------ --------- -------------------------------
664  *   MAU     mau    fault.cpu.<cputype>.mau
665  *
666  * The expected resolution of mau faults is the repair of the indicated CPU.
667  */
668 extern cmd_evdisp_t cmd_mau(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
669     const char *, cmd_errcl_t);	/* presumably the MAU row above */
670 
671 /*
672  * L2CTL errors
673  *
674  *         SERD name
675  *   Type  (if any)   Fault
676  *  ------ --------- -------------------------------
677  *  L2CTL     -     fault.cpu.<cputype>.l2ctl
678  *
679  * The expected resolution of l2ctl faults is the repair of the indicated CPU.
680  */
681 extern cmd_evdisp_t cmd_l2ctl(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
682     const char *, cmd_errcl_t);	/* presumably the L2CTL row above */
683 
684 /*
685  * SBD (Store Buffer Data) errors
686  * SCA (Scratchpad Array) errors
687  * TC (Tick compare) errors
688  * TSA (Trap stack Array) errors
689  *
690  *         SERD name
691  *   Type  (if any)   Fault
692  *  ------ --------- -------------------------------
693  *   SBDC     misc_regs    fault.cpu.<cputype>.misc_regs
694  *   SBDU
695  *   SCAC, SCAU
696  *   TCC, TCU
697  *   TSAC, TSAU
698  *
699  * The expected resolution of misc_regs faults is the repair of
700  * the indicated CPU.
701  */
702 extern cmd_evdisp_t cmd_miscregs_ce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
703     const char *, cmd_errcl_t);	/* presumably SBDC, SCAC, TCC, TSAC */
704 extern cmd_evdisp_t cmd_miscregs_ue(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
705     const char *, cmd_errcl_t);	/* presumably SBDU, SCAU, TCU, TSAU */
706 
707 extern cmd_evdisp_t cmd_miscregs_train(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
708     const char *, cmd_errcl_t);	/* NOTE(review): train handling not described here */
709 
710 /*
711  * Type                                          Fault
712  * ---------------------------------------------------------------------
713  * LFU-RTF   uncorrectable link retrain fail error    fault.cpu.T2plus.lfu-u
714  * LFU-TTO   uncorrectable training timeout error
715  * LFU-CTO   uncorrectable config timeout error
716  * LFU-MLF   uncorrectable multi lanes link fail error
717  * LFU-SLF   correctable single lane failover	      fault.cpu.T2plus.lfu-f
718  *
719  * The expected resolution of lfu faults is the repair of the indicated CPU.
720  */
721 extern cmd_evdisp_t cmd_lfu_ue(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
722     const char *, cmd_errcl_t);	/* presumably the uncorrectable LFU rows */
723 extern cmd_evdisp_t cmd_lfu_ce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
724     const char *, cmd_errcl_t);	/* presumably LFU-SLF (correctable) */
725 /*
726  * Type                                          Fault
727  * ---------------------------------------------------------------------
728  * Coherency link protocol errors
729  * to        Transaction timed out  		fault.cpu.T2plus.lfu-p
730  * frack     Invalid or redundant request ack
731  * fsr       Invalid or redundant snoop response
732  * fdr       Invalid or redundant data return
733  * snptyp    Invalid snoop type received from
734  *           coherency link
735  *
736  * The expected resolution of lfu faults is the repair of the indicated CPU.
737  */
738 extern cmd_evdisp_t cmd_lfu_pe(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
739     const char *, cmd_errcl_t);	/* presumably the link protocol errors above */
740 
741 /*
742  * CPUs are described by FMRIs.  This routine will retrieve the CPU state
743  * structure (creating a new one if necessary) described by the detector
744  * FMRI in the passed ereport.
745  */
746 extern cmd_cpu_t *cmd_cpu_lookup_from_detector(fmd_hdl_t *, nvlist_t *,
747     const char *, uint8_t);	/* uint8_t is presumably a CMD_CPU_LEVEL_* -- confirm */
748 
749 extern char *cmd_cpu_getfrustr(fmd_hdl_t *, cmd_cpu_t *);
750 extern char *cmd_cpu_getpartstr(fmd_hdl_t *, cmd_cpu_t *);
751 
752 extern char *cmd_cpu_getserialstr(fmd_hdl_t *, cmd_cpu_t *);
753 extern nvlist_t *cmd_cpu_mkfru(fmd_hdl_t *, char *, char *, char *);	/* NOTE(review): order of the three strings is not shown here */
754 
755 extern cmd_cpu_t *cmd_cpu_lookup(fmd_hdl_t *, nvlist_t *, const char *,
756     uint8_t);	/* uint8_t is presumably a CMD_CPU_LEVEL_* -- confirm */
757 
758 extern void cmd_cpu_create_faultlist(fmd_hdl_t *, fmd_case_t *, cmd_cpu_t *,
759     const char *, nvlist_t *, uint_t);
760 
761 extern cmd_cpu_t *cmd_restore_cpu_only(fmd_hdl_t *, fmd_case_t *, char *);
762 extern void cmd_cpu_destroy(fmd_hdl_t *, cmd_cpu_t *);
763 extern void *cmd_cpu_restore(fmd_hdl_t *, fmd_case_t *, cmd_case_ptr_t *);
764 extern void cmd_cpu_validate(fmd_hdl_t *);
765 extern void cmd_cpu_timeout(fmd_hdl_t *, id_t, void *);	/* id_t is presumably a timer ID (cf. cpu_uec_flush) */
766 extern void cmd_cpu_gc(fmd_hdl_t *);
767 extern void cmd_cpu_fini(fmd_hdl_t *hdl);
768 extern char *cmd_cpu_serdnm_create(fmd_hdl_t *, cmd_cpu_t *, const char *);
769 extern nvlist_t *cmd_cpu_fmri_create(uint32_t, uint8_t);	/* presumably (cpuid, level) -- confirm */
770 
771 extern uint32_t cmd_cpu2core(uint32_t, cmd_cpu_type_t, uint8_t);	/* presumably (cpuid, type, level) -- confirm */
772 
773 #define	CMD_CPU_LEVEL_THREAD		0	/* cpu group levels, as kept in cpup_level */
774 #define	CMD_CPU_LEVEL_CORE		1
775 #define	CMD_CPU_LEVEL_CHIP		2
776 #define	CMD_CPU_STAT_BUMP(cpu, name)    ((cpu)->name.fmds_value.ui64++)	/* increment the ui64 value of stat member `name' of *(cpu); cpu may be any pointer expression */
777 
778 typedef enum {		/* CPU family; return type of cmd_cpu_check_support() */
779     CMD_CPU_FAM_UNSUPPORTED,
780     CMD_CPU_FAM_CHEETAH,
781     CMD_CPU_FAM_NIAGARA,
782     CMD_CPU_FAM_SPARC64
783 } cpu_family_t;
784 
785 typedef struct faminfo {
786 	cpu_family_t fam_value;		/* CPU family */
787 	boolean_t ecache_flush_needed;	/* presumably: family needs E$ flushes */
788 } faminfo_t;
789 
790 extern cpu_family_t cmd_cpu_check_support(void);
791 extern boolean_t cmd_cpu_ecache_support(void);	/* presumably reports ecache_flush_needed -- confirm */
792 
793 extern int cmd_xr_fill(fmd_hdl_t *, nvlist_t *, cmd_xr_t *, cmd_errcl_t);
794 extern void cmd_fill_errdata(cmd_errcl_t, cmd_cpu_t *, cmd_case_t **,
795     const errdata_t **);	/* the two double pointers are presumably outputs -- confirm */
796 extern cmd_xxcu_trw_t *cmd_trw_lookup(uint64_t, uint8_t, uint64_t);	/* NOTE(review): repeats the declaration next to cmd_trw_alloc */
797 extern cmd_evdisp_t cmd_nop_train(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
798     const char *, cmd_errcl_t);
799 extern cmd_errcl_t cmd_train_match(cmd_errcl_t, cmd_errcl_t);
800 extern int cmd_afar_status_check(uint8_t, cmd_errcl_t);
801 
802 #ifdef sun4u
803 extern int cmd_cpu_synd_check(uint16_t, cmd_errcl_t clcode);	/* same width as the sun4u xr_synd */
804 #else /* sun4u */
805 extern int cmd_cpu_synd_check(uint32_t, cmd_errcl_t clcode);	/* same width as the non-sun4u xr_synd */
806 #endif /* sun4u */
807 
808 extern int cmd_afar_valid(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_errcl_t,
809     uint64_t *afar);
810 
811 #ifdef __cplusplus
812 }
813 #endif
814 
815 #endif /* _CMD_CPU_H */
816