/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef _CMD_CPU_H
#define	_CMD_CPU_H

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Each CPU of interest has a cmd_cpu_t structure.  CPUs become of interest
 * when they are the focus of ereports, or when they detect UEs.  CPUs may be
 * the target of several different kinds of ereport, each of which is tracked
 * differently.  cpu_cases lists the types of cases that can be open against a
 * given CPU.  The life of a CPU is complicated by the fact that xxCs and xxUs
 * received by the DE may in fact be side-effects of earlier UEs, xxCs, or
 * xxUs.  Causes of side-effects, and actions taken to resolve them, can be
 * found below and in cmd_memerr.h.
 *
 * Data structures:
 *      ________                                      CMD_PTR_CPU_ICACHE
 *     /        \        ,--------.                   CMD_PTR_CPU_DCACHE
 *     |CPU     | <----  |case_ptr|  (one or more of  CMD_PTR_CPU_PCACHE )
 *     |        |        `--------'                   CMD_PTR_CPU_ITLB
 *     |,-------|        ,-------.                    CMD_PTR_CPU_DTLB
 *     ||asru   | ---->  |fmri_t |                    CMD_PTR_CPU_L2DATA
 *     |:-------|        :-------:                    CMD_PTR_CPU_L2DATA_UERETRY
 *     ||fru    | ---->  |fmri_t |                    CMD_PTR_CPU_L2TAG
 *     |`-------|        `-------'                    CMD_PTR_CPU_L3DATA
 *     |        |        ,---------.                  CMD_PTR_CPU_L3DATA_UERETRY
 *     | uec    | ---->  |UE cache |                  CMD_PTR_CPU_L3TAG
 *     \________/        `---------'                  CMD_PTR_CPU_FPU
 *                                                    CMD_PTR_CPU_IREG
 *                                                    CMD_PTR_CPU_FREG
 *                                                    CMD_PTR_CPU_MAU
 *                                                    CMD_PTR_CPU_L2CTL
 *
 *      ________
 *     /        \        ,--------.
 *     | xr     | <----  |case_ptr|  (CMD_PTR_XR_WAITER)
 *     |        |        `--------'
 *     |,-------|        ,-------.
 *     ||rsrc   | ---->  |fmri_t |
 *     |`-------|        `-------'
 *     | cpu    | ---->  detecting CPU
 *     \________/
 *
 * Data structure	P?	Case-	Notes
 *				Rel?
 * ----------------	---	-----	--------------------------------------
 * cmd_cpu_t		Yes	No	Name is derived from CPU ID ("cpu_%d")
 * cmd_case_ptr_t	Yes	Yes	Name is case's UUID
 * cpu_asru (fmri_t)	Yes	No	Name is derived from CPU ID ("cpu_asru_%d")
 * cpu_fru (fmri_t)	Yes	No	Name is derived from CPU ID ("cpu_fru_%d")
 * cpu_uec		Yes	No	Name is derived from CPU ID ("cpu_uec_%d")
 * cmd_xr_t		Yes	Yes	Name is `redelivery'
 * xr_rsrc (fmri_t)	Yes	No	Name is derived from case's UUID ("%s_rsrc")
 */

#include <cmd.h>
#include <cmd_state.h>
#include <cmd_fmri.h>

#ifdef __cplusplus
extern "C" {
#endif

#define	CPU_FRU_FMRI	FM_FMRI_SCHEME_HC":///" \
    FM_FMRI_LEGACY_HC"="

#define	BK_LFUFAULT_CERT	50

typedef struct cmd_cpu cmd_cpu_t;

typedef enum cmd_cpu_type {
	CPU_ULTRASPARC_III = 1,
	CPU_ULTRASPARC_IIIplus,
	CPU_ULTRASPARC_IIIi,
	CPU_ULTRASPARC_IV,
	CPU_ULTRASPARC_IVplus,
	CPU_ULTRASPARC_IIIiplus,
	CPU_ULTRASPARC_T1,
	CPU_SPARC64_VI,
	CPU_SPARC64_VII,
	CPU_ULTRASPARC_T2,
	CPU_ULTRASPARC_T2plus
} cmd_cpu_type_t;

typedef struct cmd_cpu_cases {
	cmd_case_t cpuc_icache;		/* All I$ errors (IPE, IDSPE, etc) */
	cmd_case_t cpuc_dcache;		/* All D$ errors (DPE, DDSPE, etc) */
	cmd_case_t cpuc_pcache;		/* All P$ errors (PDSPE) */
	cmd_case_t cpuc_itlb;		/* ITLB errors (ITLBPE) */
	cmd_case_t cpuc_dtlb;		/* DTLB errors (DTLBPE) */
	cmd_case_t cpuc_l2data;		/* All correctable L2$ data errors */
	cmd_case_t cpuc_l2tag;		/* All correctable L2$ tag errors */
	cmd_case_t cpuc_l3data;		/* All correctable L3$ data errors */
	cmd_case_t cpuc_l3tag;		/* All correctable L3$ tag errors */
	cmd_case_t cpuc_fpu;		/* FPU errors */
	cmd_case_t cpuc_ireg;		/* Integer reg errors (IRC, IRU) */
	cmd_case_t cpuc_freg;		/* Floatpnt reg errors (frc, fru) */
	cmd_case_t cpuc_mau;		/* Modular arith errors (MAU) */
	cmd_case_t cpuc_l2ctl;		/* L2$ directory, VUAD parity */
	cmd_case_t cpuc_misc_regs;	/* Scratchpad array (SCA) */
					/* Tick compare (TC) */
					/* Store buffer (SBD) */
					/* Trap stack array errors (TSA) */
	cmd_case_t cpuc_lfu;		/* Coherency link error (LFU) */
#ifdef sun4u
	cmd_case_t cpuc_opl_invsfsr;	/* Olympus-C cpu inv-sfsr errors */
	cmd_case_t cpuc_oplue_detcpu;	/* Olympus-C cpu det. ue (eid=CPU) */
	cmd_case_t cpuc_oplue_detio;	/* Olympus-C io det. ue (eid=CPU) */
	cmd_case_t cpuc_opl_mtlb;	/* Olympus-C mtlb errors */
	cmd_case_t cpuc_opl_tlbp;	/* Olympus-C tlbp errors */
	cmd_case_t cpuc_opl_inv_urg;	/* Olympus-C inv-urg invalid urgent */
	cmd_case_t cpuc_opl_cre;	/* Olympus-C cre urgent errors */
	cmd_case_t cpuc_opl_tsb_ctx;	/* Olympus-C tsb_ctx urgent errors */
	cmd_case_t cpuc_opl_tsbp;	/* Olympus-C tsbp urgent errors */
	cmd_case_t cpuc_opl_pstate;	/* Olympus-C pstate urgent errors */
	cmd_case_t cpuc_opl_tstate;	/* Olympus-C tstate urgent errors */
	cmd_case_t cpuc_opl_iug_f;	/* Olympus-C iug_f urgent errors */
	cmd_case_t cpuc_opl_iug_r;	/* Olympus-C iug_r urgent errors */
	cmd_case_t cpuc_opl_sdc;	/* Olympus-C sdc urgent errors */
	cmd_case_t cpuc_opl_wdt;	/* Olympus-C wdt urgent errors */
	cmd_case_t cpuc_opl_dtlb;	/* Olympus-C dtlb urgent errors */
	cmd_case_t cpuc_opl_itlb;	/* Olympus-C itlb urgent errors */
	cmd_case_t cpuc_opl_core_err;	/* Olympus-C core-err urgent errors */
	cmd_case_t cpuc_opl_dae;	/* Olympus-C dae urgent errors */
	cmd_case_t cpuc_opl_iae;	/* Olympus-C iae urgent errors */
	cmd_case_t cpuc_opl_uge;	/* Olympus-C uge urgent errors */
#endif	/* sun4u */
} cmd_cpu_cases_t;

/*
 * The UE cache.  We actually have two UE caches - the current one and the old
 * one.  When it's time to flush the UE cache, we move the current UE cache to
 * the old position and flush the E$.  Then, we schedule the removal of the old
 * UE cache.  This allows a) xxUs triggered by the flush to match against the
 * old cache, while b) still allowing new UEs to be added to the current UE
 * cache.  UE matches will always search in both caches (if present), but
 * additions will only end up in the current cache.  We go to all of this
 * effort because the cost of a missed ereport (discarding due to a false match
 * in the cache) is much less than that of a missed match.  In the latter case,
 * the CPU will be erroneously offlined.
 *
 * A special case is triggered if we see a UE whose AFAR is not valid.  Without
 * the AFAR, we aren't able to properly match subsequent xxUs.  As a result,
 * we need to throw the cache into all-match mode, wherein all subsequent match
 * attempts will succeed until the UE cache is flushed.
 */

#define	CPU_UEC_F_ALLMATCH	0x1	/* all-match mode active */

typedef struct cmd_cpu_uec {
	uint64_t *uec_cache;		/* The UE cache */
	uint_t uec_nent;		/* Number of allocated slots in cache */
	uint_t uec_flags;		/* CPU_UEC_F_* */
	char uec_bufname[CMD_BUFNMLEN];	/* Name of buffer used for cache */
} cmd_cpu_uec_t;

extern const char *cmd_cpu_type2name(fmd_hdl_t *, cmd_cpu_type_t);
extern void cmd_cpu_uec_add(fmd_hdl_t *, cmd_cpu_t *, uint64_t);
extern int cmd_cpu_uec_match(cmd_cpu_t *, uint64_t);
extern void cmd_cpu_uec_clear(fmd_hdl_t *, cmd_cpu_t *);
extern void cmd_cpu_uec_set_allmatch(fmd_hdl_t *, cmd_cpu_t *);

/*
 * Certain types of xxC and xxU can trigger other types as side-effects.  These
 * secondary ereports need to be discarded, as treating them as legitimate
 * ereports in their own right will cause erroneous diagnosis.  As an example
 * (see cmd_xxcu_trains for more), an L2$ UCC will usually trigger an L2$ WDC
 * resulting from the trap handler's flushing of the L2$.  If we treat both as
 * legitimate, we'll end up adding two ereports to the SERD engine,
 * significantly cutting the threshold for retiring the CPU.
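 *
 * For example (ereport class names shown for illustration only), a single
 * L2$ UCC event on an UltraSPARC-IV+ CPU might arrive as the pair
 *
 *	ereport.cpu.ultraSPARC-IVplus.ucc	(the primary)
 *	ereport.cpu.ultraSPARC-IVplus.wdc	(side-effect of the L2$ flush)
 *
 * and only the first of the two should be counted for diagnosis.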
 *
 * Our saving grace is the fact that the side-effect ereports will have the
 * same ENA as the primary.  As such, we can keep track of groups of ereports
 * by ENA.  These groups, which we'll call trains, can then be matched against
 * a list of known trains.  The list (an array of cmd_xxcu_train_t structures)
 * has both a description of the composition of the train and an indication as
 * to which of the received ereports is the primary.
 *
 * The cmd_xxcu_trw_t is used to gather the members of the train.  When the
 * first member comes in, we allocate a trw, recording the ENA of the ereport,
 * as well as noting its class in trw_mask.  We then reschedule the delivery of
 * the ereport for some configurable time in the future, trusting that all
 * members of the train will have arrived by that time.  Subsequent ereports in
 * the same train match the recorded ENA, and add themselves to the mask.
 * When the first ereport is redelivered, trw_mask is used to determine whether
 * or not a train has been seen.  An exact match is required.  If a match is
 * made, the ereport indicated as the primary cause is used for diagnosis.
 */

#define	CMD_TRW_F_DELETING	0x1	/* reclaiming events */
#define	CMD_TRW_F_CAUSESEEN	0x2	/* cause of train already processed */
#define	CMD_TRW_F_GCSEEN	0x4	/* seen by GC, erased next time */

typedef struct cmd_xxcu_trw {
	uint64_t	trw_ena;	/* the ENA for this group of ereports */
	uint64_t	trw_afar;	/* the AFAR for this group of ereports */
	cmd_errcl_t	trw_mask;	/* ereports seen thus far with this ENA */
	uint16_t	trw_cpuid;	/* CPU to which this watcher belongs */
	uint8_t		trw_ref;	/* number of ereports with this ENA */
	uint8_t		trw_flags;	/* CMD_TRW_F_* */
	uint32_t	trw_pad;
} cmd_xxcu_trw_t;

extern cmd_xxcu_trw_t *cmd_trw_lookup(uint64_t, uint8_t, uint64_t);
extern cmd_xxcu_trw_t *cmd_trw_alloc(uint64_t, uint64_t);
extern void cmd_trw_restore(fmd_hdl_t *);
extern void cmd_trw_write(fmd_hdl_t *);
extern void cmd_trw_ref(fmd_hdl_t *, cmd_xxcu_trw_t *, cmd_errcl_t);
extern void cmd_trw_deref(fmd_hdl_t *, cmd_xxcu_trw_t *);

extern cmd_errcl_t cmd_xxcu_train_match(cmd_errcl_t);

/*
 * We don't have access to ereport nvlists when they are redelivered via timer.
 * As such, we have to retrieve everything we might need for diagnosis when we
 * first receive the ereport.  The retrieved information is stored in the
 * cmd_xr_t, which is persisted.
 */

typedef struct cmd_xr cmd_xr_t;

/*
 * xr_hdlr can't be persisted, so we use these in xr_hdlrid to indicate the
 * handler to be used.  xr_hdlr is then updated so it can be used directly.
 */
#define	CMD_XR_HDLR_XXC		1
#define	CMD_XR_HDLR_XXU		2
#define	CMD_XR_HDLR_NOP		3

typedef void cmd_xr_hdlr_f(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);

/*
 * For sun4v, the size of xr_synd is expanded to 32 bits in order to
 * accommodate the Niagara L2 syndrome (4x7 bits).
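 *
 * On restart, the persisted xr_hdlrid is used to recalculate xr_hdlr.  A
 * minimal sketch of that mapping (illustrative only; the actual restore code
 * lives outside this header):
 *
 *	switch (xr->xr_hdlrid) {
 *	case CMD_XR_HDLR_XXC:
 *		xr->xr_hdlr = cmd_xxc_resolve;
 *		break;
 *	case CMD_XR_HDLR_XXU:
 *		xr->xr_hdlr = cmd_xxu_resolve;
 *		break;
 *	case CMD_XR_HDLR_NOP:
 *		xr->xr_hdlr = cmd_nop_resolve;
 *		break;
 *	}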
 */

struct cmd_xr {
	cmd_list_t xr_list;
	id_t xr_id;			/* ID of timer used for redelivery */
	cmd_cpu_t *xr_cpu;		/* Detecting CPU, recalc'd from cpuid */
	uint32_t xr_cpuid;		/* ID of detecting CPU */
	uint64_t xr_ena;		/* ENA from ereport */
	uint64_t xr_afar;		/* AFAR from ereport nvlist */
#ifdef sun4u
	uint16_t xr_synd;		/* syndrome from ereport nvlist */
#else /* sun4u */
	uint32_t xr_synd;		/* for Niagara, enlarged to 32 bits */
#endif /* sun4u */
	uint8_t xr_afar_status;		/* AFAR status from ereport nvlist */
	uint8_t xr_synd_status;		/* syndrome status from ereport nvlist */
	cmd_fmri_t xr_rsrc;		/* resource from ereport nvlist */
	cmd_errcl_t xr_clcode;		/* CMD_ERRCL_* for this ereport */
	cmd_xr_hdlr_f *xr_hdlr;		/* handler, recalc'd from hdlrid on restart */
	uint_t xr_hdlrid;		/* CMD_XR_HDLR_*, used for recalc of hdlr */
	fmd_case_t *xr_case;		/* Throwaway case used to track redelivery */
	uint_t xr_ref;			/* Number of references to this struct */
#ifdef sun4u
	uint64_t xr_afsr;		/* AFSR from ereport nvlist */
	uint8_t xr_num_ways;		/* Number of Cache ways reporting from nvlist */
	uint32_t xr_error_way;		/* The way from the ereport nvlist payload */
	uint64_t xr_error_tag;		/* The tag from the ereport nvlist payload */
	uint32_t xr_error_index;	/* the index from the ereport payload */
	uint64_t *xr_cache_data;	/* The cache data */
	nvlist_t *xr_detector_nvlist;	/* The detecting resource */
#endif
};

#define	xr_rsrc_nvl	xr_rsrc.fmri_nvl

extern cmd_xr_t *cmd_xr_create(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    cmd_cpu_t *, cmd_errcl_t);
extern cmd_evdisp_t cmd_xr_reschedule(fmd_hdl_t *, cmd_xr_t *, uint_t);
extern void cmd_xr_deref(fmd_hdl_t *, cmd_xr_t *);
extern void cmd_xr_write(fmd_hdl_t *, cmd_xr_t *);

extern void cmd_xxc_resolve(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);
extern void cmd_xxu_resolve(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);
extern void cmd_nop_resolve(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);
extern cmd_evdisp_t cmd_xxcu_initial(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t, uint_t);

/*
 * The master structure containing or referencing all of the state for a given
 * CPU.
 */

/*
 * We periodically flush the E$, thus allowing us to flush the UE cache (see
 * above for a description of the UE cache).  In particular, we flush it
 * whenever we see a UE with a non-valid AFAR.  To keep from overflushing the
 * CPU, we cap the number of flushes that we'll do in response to UEs with
 * non-valid AFARs.  The cap is the number of permitted flushes per GC/restart
 * cycle, and was determined arbitrarily.
 */
#define	CPU_UEC_FLUSH_MAX	3

/*
 * The CPU structure started life without a version number.  Making things
 * more complicated, the version number in the new struct occupies the space
 * used for cpu_cpuid in the non-versioned struct.  We therefore have to use
 * somewhat unorthodox version numbers to distinguish between the two types of
 * struct (pre- and post-versioning) -- version numbers that can't be mistaken
 * for CPUIDs.  Our version numbers, therefore, will be negative.
 *
 * For future expansion, the version member must always stay where it is.  At
 * some point in the future, when more structs get versions, the version member
 * should move into the cmd_header_t.
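 *
 * As a sketch of the intent (not actual restore code): a persisted value of,
 * say, 2 in this slot can only be a CPUID left behind by an unversioned
 * cmd_cpu_0_t, whereas CPU_MKVERSION(2) == (uint_t)-2 can only be a version
 * number.  Restore code can therefore distinguish the two forms with the
 * signed comparison encoded by CMD_CPU_VERSIONED() below:
 *
 *	if (CMD_CPU_VERSIONED(cpu))
 *		(interpret the buffer as a versioned cmd_cpu_1_t or later)
 *	else
 *		(interpret the buffer as an unversioned cmd_cpu_0_t)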
 */
#define	CPU_MKVERSION(version)	((uint_t)(0 - (version)))

#define	CMD_CPU_VERSION_1	CPU_MKVERSION(1)	/* -1 */
#define	CMD_CPU_VERSION_2	CPU_MKVERSION(2)	/* -2 */
#define	CMD_CPU_VERSION_3	CPU_MKVERSION(3)	/* -3 */
#define	CMD_CPU_VERSION		CMD_CPU_VERSION_3

#define	CMD_CPU_VERSIONED(cpu)	((int)(cpu)->cpu_version < 0)

#define	CMD_CPU_F_DELETING	0x1

typedef struct cmd_cpu_0 {
	cmd_header_t cpu0_header;	/* Nodetype must be CMD_NT_CPU */
	uint32_t cpu0_cpuid;		/* Logical ID for this CPU */
	cmd_cpu_type_t cpu0_type;	/* CPU model */
	fmd_case_t *cpu0_cases[4];	/* v0 had embedded case_t w/4 cases */
	uint8_t cpu0_faulting;		/* Set if fault has been issued */
	cmd_fmri_t cpu0_asru;		/* ASRU for this CPU */
	cmd_fmri_t cpu0_fru;		/* FRU for this CPU */
	cmd_cpu_uec_t cpu0_uec;		/* UE cache */
	cmd_cpu_uec_t cpu0_olduec;	/* To-be-flushed UE cache */
	id_t cpu0_uec_flush;		/* Timer ID for UE cache flush */
	uint_t cpu0_uec_nflushes;	/* # of flushes since last restart/GC */
	cmd_list_t cpu0_xxu_retries;	/* List of pending xxU retries */
} cmd_cpu_0_t;

typedef struct cmd_cpu_1 {
	cmd_header_t cpu1_header;	/* Nodetype must be CMD_NT_CPU */
	uint_t cpu1_version;		/* struct version - must follow hdr */
	uint32_t cpu1_cpuid;		/* Logical ID for this CPU */
	cmd_cpu_type_t cpu1_type;	/* CPU model */
	uintptr_t *cpu1_cases;		/* v1 had a pointer to a case array */
	uint8_t cpu1_faulting;		/* Set if fault has been issued */
	cmd_fmri_t cpu1_asru;		/* ASRU for this CPU */
	cmd_fmri_t cpu1_fru;		/* FRU for this CPU */
	cmd_cpu_uec_t cpu1_uec;		/* UE cache */
	cmd_cpu_uec_t cpu1_olduec;	/* To-be-flushed UE cache */
	id_t cpu1_uec_flush;		/* Timer ID for UE cache flush */
	uint_t cpu1_uec_nflushes;	/* # of flushes since last restart/GC */
	cmd_list_t cpu1_xxu_retries;	/* List of pending xxU retries */
} cmd_cpu_1_t;

typedef struct cmd_cpu_2 {
	cmd_header_t cpu2_header;	/* Nodetype must be CMD_NT_CPU */
	uint_t cpu2_version;		/* struct version - must follow hdr */
	uint32_t cpu2_cpuid;		/* Logical ID for this CPU */
	cmd_cpu_type_t cpu2_type;	/* CPU model */
	uint8_t cpu2_faulting;		/* Set if fault has been issued */
	cmd_fmri_t cpu2_asru;		/* ASRU for this CPU */
	cmd_fmri_t cpu2_fru;		/* FRU for this CPU */
	cmd_cpu_uec_t cpu2_uec;		/* UE cache */
	cmd_cpu_uec_t cpu2_olduec;	/* To-be-flushed UE cache */
} cmd_cpu_2_t;

/* Portion of the cpu structure which must be persisted */
typedef struct cmd_cpu_pers {
	cmd_header_t cpup_header;	/* Nodetype must be CMD_NT_CPU */
	uint_t cpup_version;		/* struct version - must follow hdr */
	uint32_t cpup_cpuid;		/* Logical ID for this CPU */
	cmd_cpu_type_t cpup_type;	/* CPU model */
	uint8_t cpup_faulting;		/* Set if fault has been issued */
	uint8_t cpup_level;		/* cpu group level - 0 == thread */
	cmd_fmri_t cpup_asru;		/* ASRU for this CPU */
	cmd_fmri_t cpup_fru;		/* FRU for this CPU */
	cmd_cpu_uec_t cpup_uec;		/* UE cache */
	cmd_cpu_uec_t cpup_olduec;	/* To-be-flushed UE cache */
} cmd_cpu_pers_t;

/* Persistent and dynamic CPU data */
struct cmd_cpu {
	cmd_cpu_pers_t cpu_pers;
	cmd_cpu_cases_t cpu_cases;
	id_t cpu_uec_flush;		/* Timer ID for UE cache flush */
	uint_t cpu_uec_nflushes;	/* # of flushes since last restart/GC */
	cmd_list_t cpu_xxu_retries;	/* List of pending xxU retries */
	uint_t cpu_flags;
	cmd_list_t cpu_Lxcaches;	/* List of Lxcache state structures */
	fmd_stat_t Lxcache_creat;	/* num of Lxcache states created */
};

#define	CMD_CPU_MAXSIZE \
	MAX(MAX(sizeof (cmd_cpu_0_t), sizeof (cmd_cpu_1_t)), \
	    MAX(sizeof (cmd_cpu_2_t), sizeof (cmd_cpu_pers_t)))
#define	CMD_CPU_MINSIZE \
	MIN(MIN(sizeof (cmd_cpu_0_t), sizeof (cmd_cpu_1_t)), \
	    MIN(sizeof (cmd_cpu_2_t), sizeof (cmd_cpu_pers_t)))

#define	cpu_header		cpu_pers.cpup_header
#define	cpu_nodetype		cpu_pers.cpup_header.hdr_nodetype
#define	cpu_bufname		cpu_pers.cpup_header.hdr_bufname
#define	cpu_version		cpu_pers.cpup_version
#define	cpu_cpuid		cpu_pers.cpup_cpuid
#define	cpu_type		cpu_pers.cpup_type
#define	cpu_faulting		cpu_pers.cpup_faulting
#define	cpu_level		cpu_pers.cpup_level
#define	cpu_asru		cpu_pers.cpup_asru
#define	cpu_fru			cpu_pers.cpup_fru
#define	cpu_uec			cpu_pers.cpup_uec
#define	cpu_olduec		cpu_pers.cpup_olduec
#define	cpu_icache		cpu_cases.cpuc_icache
#define	cpu_dcache		cpu_cases.cpuc_dcache
#define	cpu_pcache		cpu_cases.cpuc_pcache
#define	cpu_itlb		cpu_cases.cpuc_itlb
#define	cpu_dtlb		cpu_cases.cpuc_dtlb
#define	cpu_l2data		cpu_cases.cpuc_l2data
#define	cpu_l2tag		cpu_cases.cpuc_l2tag
#define	cpu_l3data		cpu_cases.cpuc_l3data
#define	cpu_l3tag		cpu_cases.cpuc_l3tag
#define	cpu_fpu			cpu_cases.cpuc_fpu
#define	cpu_ireg		cpu_cases.cpuc_ireg
#define	cpu_freg		cpu_cases.cpuc_freg
#define	cpu_mau			cpu_cases.cpuc_mau
#define	cpu_l2ctl		cpu_cases.cpuc_l2ctl
#define	cpu_misc_regs		cpu_cases.cpuc_misc_regs
#define	cpu_lfu			cpu_cases.cpuc_lfu
#ifdef sun4u
#define	cpu_opl_invsfsr		cpu_cases.cpuc_opl_invsfsr
#define	cpu_oplue_detcpu	cpu_cases.cpuc_oplue_detcpu
#define	cpu_oplue_detio		cpu_cases.cpuc_oplue_detio
#define	cpu_opl_mtlb		cpu_cases.cpuc_opl_mtlb
#define	cpu_opl_tlbp		cpu_cases.cpuc_opl_tlbp
#define	cpu_opl_inv_urg		cpu_cases.cpuc_opl_inv_urg
#define	cpu_opl_cre		cpu_cases.cpuc_opl_cre
#define	cpu_opl_tsb_ctx		cpu_cases.cpuc_opl_tsb_ctx
#define	cpu_opl_tsbp		cpu_cases.cpuc_opl_tsbp
#define	cpu_opl_pstate		cpu_cases.cpuc_opl_pstate
#define	cpu_opl_tstate		cpu_cases.cpuc_opl_tstate
#define	cpu_opl_iug_f		cpu_cases.cpuc_opl_iug_f
#define	cpu_opl_iug_r		cpu_cases.cpuc_opl_iug_r
#define	cpu_opl_sdc		cpu_cases.cpuc_opl_sdc
#define	cpu_opl_wdt		cpu_cases.cpuc_opl_wdt
#define	cpu_opl_dtlb		cpu_cases.cpuc_opl_dtlb
#define	cpu_opl_itlb		cpu_cases.cpuc_opl_itlb
#define	cpu_opl_core_err	cpu_cases.cpuc_opl_core_err
#define	cpu_opl_dae		cpu_cases.cpuc_opl_dae
#define	cpu_opl_iae		cpu_cases.cpuc_opl_iae
#define	cpu_opl_uge		cpu_cases.cpuc_opl_uge
#endif	/* sun4u */

#define	cpu_asru_nvl		cpu_asru.fmri_nvl
#define	cpu_fru_nvl		cpu_fru.fmri_nvl

/*
 * L2$ and L3$ Data errors
 *
 *           SERD name
 * Type      (if any)      Fault
 * ------    -----------   -------------------------------
 * xxC       l2cachedata   fault.cpu.<cputype>.l2cachedata
 * xxU       -             fault.cpu.<cputype>.l2cachedata
 * L3_xxC    l3cachedata   fault.cpu.<cputype>.l3cachedata
 * L3_xxU    -             fault.cpu.<cputype>.l3cachedata
 *
 * NOTE: For the purposes of the discussion below, xxC and xxU refer to both
 * L2$ and L3$ data errors.
 *
 * These ereports will be dropped if (among other things) they are side-effects
 * of UEs (xxUs only) or other xxCs or xxUs.  Whenever UEs are detected, they
 * are added to a per-CPU cache.  xxUs are then compared to this cache.  If a
 * xxU's AFAR refers to an address which recently saw a UE, the xxU is dropped,
 * as it was most likely caused by the UE.  When multiple xxCs and xxUs are
 * seen with the same ENA, all save one are generally side-effects.  We track
 * these groups (referred to as trains), matching them against a premade list.
 * If one of the trains matches, we drop all but the primary, which is
 * indicated in the list.
 *
 * The expected resolution of l2cachedata and l3cachedata faults is the
 * disabling of the indicated CPU.
 */
extern cmd_evdisp_t cmd_xxc(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);
extern cmd_evdisp_t cmd_xxu(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

/*
 * As of Niagara-2, we ignore writeback (ldwc, ldwu) errors.  Since these were
 * the only defined follow-on errors for sun4v trains, sun4v L2 cache data
 * errors no longer need to use the train mechanism.
 */

extern cmd_evdisp_t cmd_l2c(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);
extern cmd_evdisp_t cmd_l2u(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

/*
 * Common Errdata structure for SERD engines
 */
typedef struct errdata {
	cmd_serd_t *ed_serd;
	const char *ed_fltnm;
	const cmd_ptrsubtype_t ed_pst;
} errdata_t;

/*
 * L2$ and L3$ Tag errors
 *
 *            SERD name
 * Type       (if any)      Fault
 * -------    -----------   -------------------------------
 * TxCE       l2cachetag    fault.cpu.<cputype>.l2cachetag
 * L3_THCE    l3cachetag    fault.cpu.<cputype>.l3cachetag
 * LTC        l2cachetag    fault.cpu.<cputype>.l2cachetag
 *
 * We'll never see the uncorrectable Tag errors - they'll cause the machine to
 * reset, and we'll be ne'er the wiser.
 *
 * The expected resolution of l2cachetag and l3cachetag faults is the disabling
 * of the indicated CPU.
 */
extern cmd_evdisp_t cmd_txce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

extern cmd_evdisp_t cmd_l3_thce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

/*
 * L1$ errors
 *
 *            SERD name
 * Type       (if any)    Fault
 * -------    ---------   -------------------------------
 * IPE        icache      fault.cpu.<cputype>.icache
 * IxSPE      icache      fault.cpu.<cputype>.icache
 * DPE        dcache      fault.cpu.<cputype>.dcache
 * DxSPE      dcache      fault.cpu.<cputype>.dcache
 * PDSPE      pcache      fault.cpu.<cputype>.pcache
 *
 * The I$, D$, and P$ are clean, and thus have no uncorrectable errors.
 *
 * The expected resolution of icache, dcache, and pcache faults is the
 * disabling of the indicated CPU.
 */
extern cmd_evdisp_t cmd_icache(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);
extern cmd_evdisp_t cmd_dcache(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);
extern cmd_evdisp_t cmd_pcache(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

/*
 * TLB errors
 *
 *            SERD name
 * Type       (if any)    Fault
 * ------     ---------   -------------------------------
 * ITLBPE     itlb        fault.cpu.<cputype>.itlb
 * DTLBPE     dtlb        fault.cpu.<cputype>.dtlb
 *
 * The expected resolution of itlb and dtlb faults is the disabling of the
 * indicated CPU.
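 *
 * Handlers such as cmd_itlb() and cmd_dtlb() below typically feed the SERD
 * engine named above.  A rough sketch of that pattern (illustrative only;
 * the real handlers do considerably more bookkeeping, and 'n' and 't' stand
 * in for the configured SERD parameters):
 *
 *	cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class,
 *	    CMD_CPU_LEVEL_THREAD);
 *	serdnm = cmd_cpu_serdnm_create(hdl, cpu, "itlb");
 *	if (!fmd_serd_exists(hdl, serdnm))
 *		fmd_serd_create(hdl, serdnm, n, t);
 *	if (fmd_serd_record(hdl, serdnm, ep))
 *		(the engine has fired; open a case and add the fault)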
 */
extern cmd_evdisp_t cmd_itlb(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);
extern cmd_evdisp_t cmd_dtlb(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

extern void cmd_cpuerr_close(fmd_hdl_t *, void *);

/*
 * FPU errors
 *
 *            SERD name
 * Type       (if any)    Fault
 * ------     ---------   -------------------------------
 * FPU        -           fault.cpu.<cputype>.fpu
 *
 * The expected resolution of FPU faults is the disabling of the indicated CPU.
 */
extern cmd_evdisp_t cmd_fpu(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

/*
 * FPU (FP-Scrubber) errors
 *
 *            SERD name
 * Type       (if any)    Fault
 * ------     ---------   -------------------------------
 * FPU        -           fault.cpu.<cputype>.fpu
 *
 * The expected resolution of FPU faults is the disabling of the CPU
 * indicted in the resource FMRI.
 */
extern cmd_evdisp_t cmd_fps(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

/*
 * ireg errors
 *
 *            SERD name
 * Type       (if any)    Fault
 * ------     ---------   -------------------------------
 * IRC        ireg        fault.cpu.<cputype>.ireg
 * IRU        -           "
 *
 * The expected resolution of ireg faults is the disabling of the indicated
 * CPU.
 */
extern cmd_evdisp_t cmd_irc(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);
extern cmd_evdisp_t cmd_iru(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

/*
 * freg errors
 *
 *            SERD name
 * Type       (if any)    Fault
 * ------     ---------   -------------------------------
 * FRC        freg        fault.cpu.ultraSPARC-T1.frc
 * FRU        -           " .fru
 *
 * The expected resolution of freg faults is the repair of the indicated CPU.
 */
extern cmd_evdisp_t cmd_frc(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);
extern cmd_evdisp_t cmd_fru(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

/*
 * MAU errors
 *
 *            SERD name
 * Type       (if any)    Fault
 * ------     ---------   -------------------------------
 * MAU        mau         fault.cpu.<cputype>.mau
 *
 * The expected resolution of mau faults is the repair of the indicated CPU.
 */
extern cmd_evdisp_t cmd_mau(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

/*
 * L2CTL errors
 *
 *            SERD name
 * Type       (if any)    Fault
 * ------     ---------   -------------------------------
 * L2CTL      -           fault.cpu.<cputype>.l2ctl
 *
 * The expected resolution of l2ctl faults is the repair of the indicated CPU.
 */
extern cmd_evdisp_t cmd_l2ctl(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

/*
 * SBD (Store Buffer Data) errors
 * SCA (Scratchpad Array) errors
 * TC  (Tick Compare) errors
 * TSA (Trap Stack Array) errors
 *
 *            SERD name
 * Type       (if any)    Fault
 * ------     ---------   -------------------------------
 * SBDC       misc_regs   fault.cpu.<cputype>.misc_regs
 * SBDU
 * SCAC, SCAU
 * TCC, TCU
 * TSAC, TSAU
 *
 * The expected resolution of misc_regs faults is the repair of
 * the indicated CPU.
 */
extern cmd_evdisp_t cmd_miscregs_ce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);
extern cmd_evdisp_t cmd_miscregs_ue(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

extern cmd_evdisp_t cmd_miscregs_train(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

/*
 * Type                                                 Fault
 * ---------------------------------------------------------------------
 * LFU-RTF  uncorrectable link retrain fail error       fault.cpu.T2plus.lfu-u
 * LFU-TTO  uncorrectable training timeout error
 * LFU-CTO  uncorrectable config timeout error
 * LFU-MLF  uncorrectable multi lanes link fail error
 * LFU-SLF  correctable single lane failover            fault.cpu.T2plus.lfu-f
 *
 * The expected resolution of lfu faults is the repair of the indicated CPU.
 */
extern cmd_evdisp_t cmd_lfu_ue(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);
extern cmd_evdisp_t cmd_lfu_ce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

/*
 * Type                                                 Fault
 * ---------------------------------------------------------------------
 * Coherency link protocol errors
 * to       Transaction timed out                       fault.cpu.T2plus.lfu-p
 * frack    Invalid or redundant request ack
 * fsr      Invalid or redundant snoop response
 * fdr      Invalid or redundant data return
 * snptyp   Invalid snoop type received from
 *          coherency link
 *
 * The expected resolution of lfu faults is the repair of the indicated CPU.
 */
extern cmd_evdisp_t cmd_lfu_pe(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);

/*
 * CPUs are described by FMRIs.  This routine will retrieve the CPU state
 * structure (creating a new one if necessary) described by the detector
 * FMRI in the passed ereport.
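 *
 * A minimal usage sketch (illustrative only; error handling is simplified,
 * and the CMD_EVD_* disposition shown is merely the conventional "not of
 * interest" return):
 *
 *	cmd_cpu_t *cpu;
 *
 *	cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class,
 *	    CMD_CPU_LEVEL_THREAD);
 *	if (cpu == NULL || cpu->cpu_faulting)
 *		return (CMD_EVD_UNUSED);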
 */
extern cmd_cpu_t *cmd_cpu_lookup_from_detector(fmd_hdl_t *, nvlist_t *,
    const char *, uint8_t);

extern char *cmd_cpu_getfrustr(fmd_hdl_t *, cmd_cpu_t *);
extern char *cmd_cpu_getpartstr(fmd_hdl_t *, cmd_cpu_t *);

extern char *cmd_cpu_getserialstr(fmd_hdl_t *, cmd_cpu_t *);
extern nvlist_t *cmd_cpu_mkfru(fmd_hdl_t *, char *, char *, char *);

extern cmd_cpu_t *cmd_cpu_lookup(fmd_hdl_t *, nvlist_t *, const char *,
    uint8_t);

extern void cmd_cpu_create_faultlist(fmd_hdl_t *, fmd_case_t *, cmd_cpu_t *,
    const char *, nvlist_t *, uint_t);

extern cmd_cpu_t *cmd_restore_cpu_only(fmd_hdl_t *, fmd_case_t *, char *);
extern void cmd_cpu_destroy(fmd_hdl_t *, cmd_cpu_t *);
extern void *cmd_cpu_restore(fmd_hdl_t *, fmd_case_t *, cmd_case_ptr_t *);
extern void cmd_cpu_validate(fmd_hdl_t *);
extern void cmd_cpu_timeout(fmd_hdl_t *, id_t, void *);
extern void cmd_cpu_gc(fmd_hdl_t *);
extern void cmd_cpu_fini(fmd_hdl_t *hdl);
extern char *cmd_cpu_serdnm_create(fmd_hdl_t *, cmd_cpu_t *, const char *);
extern nvlist_t *cmd_cpu_fmri_create(uint32_t, uint8_t);

extern uint32_t cmd_cpu2core(uint32_t, cmd_cpu_type_t, uint8_t);

#define	CMD_CPU_LEVEL_THREAD	0
#define	CMD_CPU_LEVEL_CORE	1
#define	CMD_CPU_LEVEL_CHIP	2
#define	CMD_CPU_STAT_BUMP(cpu, name)	cpu->name.fmds_value.ui64++

typedef enum {
	CMD_CPU_FAM_UNSUPPORTED,
	CMD_CPU_FAM_CHEETAH,
	CMD_CPU_FAM_NIAGARA,
	CMD_CPU_FAM_SPARC64
} cpu_family_t;

typedef struct faminfo {
	cpu_family_t fam_value;
	boolean_t ecache_flush_needed;
} faminfo_t;

extern cpu_family_t cmd_cpu_check_support(void);
extern boolean_t cmd_cpu_ecache_support(void);

extern int cmd_xr_fill(fmd_hdl_t *, nvlist_t *, cmd_xr_t *, cmd_errcl_t);
extern void cmd_fill_errdata(cmd_errcl_t, cmd_cpu_t *, cmd_case_t **,
    const errdata_t **);
extern cmd_xxcu_trw_t *cmd_trw_lookup(uint64_t, uint8_t, uint64_t);
extern cmd_evdisp_t cmd_nop_train(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *, cmd_errcl_t);
extern cmd_errcl_t cmd_train_match(cmd_errcl_t, cmd_errcl_t);
extern int cmd_afar_status_check(uint8_t, cmd_errcl_t);

#ifdef sun4u
extern int cmd_cpu_synd_check(uint16_t, cmd_errcl_t clcode);
#else /* sun4u */
extern int cmd_cpu_synd_check(uint32_t, cmd_errcl_t clcode);
#endif /* sun4u */

extern int cmd_afar_valid(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_errcl_t,
    uint64_t *afar);

#ifdef __cplusplus
}
#endif

#endif /* _CMD_CPU_H */