/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma dictionary "AMD"

/*
 * Eversholt rules for the AMD Opteron CPU/Memory
 */

#define	MAX(x, y) ((x) >= (y) ? (x) : (y))
#define	MIN(x, y) ((x) <= (y) ? (x) : (y))

/*
 * SET_ADDR and SET_OFFSET are used to set payload values in the fault that
 * we diagnose for page faults.  SET_ADDR copies the ereport's IA32_MCi_ADDR
 * into "asru-physaddr" to record the physical address of the faulting page;
 * SET_OFFSET copies the hc-specific offset of the first resource member
 * into "asru-offset".
 */
#define	SET_ADDR (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))

#define	SET_OFFSET (setpayloadprop("asru-offset", \
    payloadprop("resource[0].hc-specific.offset")))

/*
 * RESOURCE_EXISTS is true if a member with name "resource" exists in the
 * payload - regardless of type (e.g., nvlist or nvlist array) or value.
 */
#define	RESOURCE_EXISTS (payloadprop_defined("resource"))

/*
 * CONTAINS_RANK is true if the "resource" nvlist array (as used in memory
 * ereports) exists and one of its members matches the path for the
 * rank node (or for the dimm node containing it).  Our memory propagations
 * are of the form
 *
 * "prop foo@chip/memory-controller/dimm/rank -> blah@chip/core/strand"
 *
 * since cpus detect memory errors; in eversholt such a propagation, where
 * the lhs path and rhs path do not match, expands to the cross-product of
 * all dimms, ranks and cpus on the same chip (since chip appears in the
 * path on both sides).  We use CONTAINS_RANK to constrain the propagation
 * such that it only happens if the payload resource matches the rank.
 */
#define	CONTAINS_RANK (payloadprop_contains("resource", \
    asru(chip/memory-controller/dimm/rank)) \
    || payloadprop_contains("resource", \
    asru(chip/memory-controller/dimm)))

/*
 * The following will tell us whether a syndrome that is known to be
 * correctable (from a mem_ce ereport) is single-bit or multi-bit.  For a
 * correctable ChipKill syndrome the number of bits set in the lowest
 * nibble indicates how many bits were in error.
 */

#define	CBITMASK(synd) ((synd) & 0xf)

#define	CKSINGLE(synd) \
    ((synd) == 0 || \
    (CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 || \
    CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8))

#define	SINGLE_BIT_CE \
    (payloadprop("syndrome-type") == "E" || \
    (payloadprop("syndrome-type") == "C4" && \
    CKSINGLE(payloadprop("syndrome"))))

#define	MULTI_BIT_CE \
    (payloadprop("syndrome-type") == "C4" && \
    !CKSINGLE(payloadprop("syndrome")))

/* #PAGE#
 * #DIMM_SCU#
 * A single bit fault in a memory rank can cause:
 *
 *  - mem_ce : reported by nb
 *  - inf_sys_ecc1: reported by ic or dc; inf_sys_ecc1 errors detected at the
 *    ic do not record a syndrome; these errors will not be triggered in
 *    ChipKill ECC mode (the NB corrects all ECC errors in that mode)
 *  - s_ecc1: reported by bu; this error will not be triggered in ChipKill
 *    ECC mode (the NB corrects all ECC in that mode)
 *
 * Single-bit errors are fed into a per-rank SERD engine; if a SERD engine
 * trips we diagnose a fault.memory.page_sb so that the response agent can
 * retire the page that caused the trip.  If the total number of pages
 * faulted in this way on a single rank exceeds a threshold we will
 * diagnose a fault.memory.dimm_sb against the containing dimm.
 *
 * Multibit ChipKill-correctable errors are treated identically to
 * single-bit errors, but via separate serd engines to allow distinct
 * parameters if desired.
 *
 * Uncorrectable errors produce an immediate page fault and corresponding
 * fault.memory.dimm_ue.
 *
 * Page faults are essentially internal - action is only required when
 * they are accompanied by a dimm fault.  As such we include message=0
 * on page faults.
 */

event ereport.cpu.amd.ic.inf_sys_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.s_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.nb.mem_ce@chip/core/strand{within(5s)};

/*
 * Single-bit correctable errors feed into per-rank
 * SERD engines which diagnose fault.memory.page_sb if they trip.
 *
 * Multi-bit correctable (via ChipKill) errors feed
 * into additional per-rank SERD engines which diagnose fault.memory.page_ck
 * if they trip.
 *
 * The number of fault.memory.page_sb and fault.memory.page_ck diagnosed is
 * counted in stat engines for each type.  These are used in deciding
 * whether to declare a dimm faulty after repeated page faults.
 */

#define	PAGE_SB_COUNT	2
#define	PAGE_SB_TIME	72h
#define	PAGE_CK_COUNT	2
#define	PAGE_CK_TIME	72h

engine stat.sbpgflt@chip/memory-controller/dimm/rank;
engine stat.ckpgflt@chip/memory-controller/dimm/rank;
engine serd.memory.page_sb@chip/memory-controller/dimm/rank,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
engine serd.memory.page_ck@chip/memory-controller/dimm/rank,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
engine serd.memory.dimm_sb@chip/memory-controller/dimm/rank,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
engine serd.memory.dimm_ck@chip/memory-controller/dimm/rank,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
event fault.memory.page_sb@chip/memory-controller/dimm/rank, message=0,
    count=stat.sbpgflt@chip/memory-controller/dimm/rank, response=0,
    engine=serd.memory.page_sb@chip/memory-controller/dimm/rank;
event fault.memory.page_ck@chip/memory-controller/dimm/rank, message=0,
    count=stat.ckpgflt@chip/memory-controller/dimm/rank, response=0,
    engine=serd.memory.page_ck@chip/memory-controller/dimm/rank;
event fault.memory.dimm_sb@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_sb@chip/memory-controller/dimm/rank;
event fault.memory.dimm_ck@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_ck@chip/memory-controller/dimm/rank;

/*
 * The fraction of pages on a single rank that must be diagnosed as faulty
 * with single correctable unit faults before we will fault the rank.
 * Once we have faulted the rank we will continue to diagnose any further page
 * faults on the rank up to some maximum multiple of the threshold at which
 * we faulted the dimm.  This allows us to potentially contain some fairly
 * far-reaching but still limited-extent fault (such as a partial column
 * failure) without getting carried away and allowing a single faulty rank to
 * use up the entire system-imposed page retirement limit (which, once
 * reached, causes retirement request to have no effect other than to fill
 * the fault manager cache and logs).
 *
 * This fraction is specified in basis points, where 100 basis points are
 * equivalent to 1 percent.  It is applied on a per-rank basis.
 *
 * The system imposes an absolute maximum on the number of pages it will
 * retire; the current value is 10 basis points, or 0.1% of 'physmem'.  Note
 * that 'physmem' is reduced from installed memory pages by an amount
 * reflecting permanent kernel memory allocations.  This system page retire
 * limit bounds the maximum real response to page faults across all ranks
 * that fault manager response agents can effect, but it should not be confused
 * with any diagnosis threshold (i.e., the number of faulty pages we are
 * prepared to tolerate from a single rank before faulting the rank is
 * distinct from the total number of pages we are prepared to retire from use
 * in response to that and other faults).  It is, however, desirable to
 * arrange that the maximum number of pages we are prepared to fault from
 * any one rank is less than the system-wide quota.
 */
#define	PAGE_RETIRE_LIMIT_BPS	5	/* or 0.05%; ~ 131 pages/GB */

/*
 * A macro to manipulate the above fraction.  Given a size in bytes convert
 * this to pages (4K pagesize) and calculate the number of those pages
 * indicated by PAGE_RETIRE_LIMIT_BPS basis points.
 */
#define	_BPS_PGCNT(totalbytes) \
    ((((totalbytes) / 4096 ) * PAGE_RETIRE_LIMIT_BPS) / 10000)

/*
 * The single-correctable-unit threshold: the number of faulted pages on a
 * rank at which we fault the rank.  We insist that this be at least 128 and
 * never more than 512.
 */
#define	RANK_THRESH MIN(512, MAX(128, \
    _BPS_PGCNT(confprop(chip/memory-controller/dimm/rank, "size"))))

/*
 * The maximum number of single-correctable-unit page faults we will diagnose
 * on a single rank (must be greater than RANK_THRESH).  We set
 * this at twice the rank fault threshold.
 */
#define	RANK_PGFLT_MAX (2 * RANK_THRESH)

#define	SB_PGFLTS (count(stat.sbpgflt@chip/memory-controller/dimm/rank))
#define	CK_PGFLTS (count(stat.ckpgflt@chip/memory-controller/dimm/rank))

/*
 * "Single-correctable-unit" DIMM faults are diagnosed when the total number of
 * page faults (diagnosed from repeated single-bit or multibit-chipkills)
 * from any one rank on that DIMM reaches a threshold.  A "correctable unit"
 * is a single bit in normal 64/8 ECC mode, or a single symbol in ChipKill
 * 128/16 mode (i.e., nibble-aligned nibble for the code used on Opteron).
 *
 * We do not stop diagnosing further single-bit page faults once we have
 * declared a single-bit DIMM fault - we continue diagnosing them and
 * response agents can continue to retire those pages up to the system-imposed
 * retirement limit.
 *
 * Two distinct fault types may be diagnosed - fault.memory.dimm_sb and
 * fault.memory.dimm_ck.  Which one is diagnosed depends on whether we
 * have reached the threshold for a majority of single-bit page faults or
 * multibit page faults.
 *
 * Implementation: we maintain parallel SERD engines to the page_sb and
 * page_ck engines, which trip in unison.  On trip it generates a distinct
 * ereport which we diagnose to a fault if the threshold has been reached.
 */
prop fault.memory.page_sb@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SINGLE_BIT_CE &&
      SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;

prop fault.memory.page_ck@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && !SINGLE_BIT_CE &&
      SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;

prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
      SB_PGFLTS > RANK_THRESH / 2 } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;

prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
      CK_PGFLTS > RANK_THRESH / 2 } (1)->
    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;

/*
 * If the address is not valid then no resource member will be included
 * in a nb.mem_ce or nb.mem_ue ereport.  These cases should be rare.
 * We will also discard all inf_sys_ecc1 events detected at the ic since they
 * have no syndrome and therefore no resource information.
 * We will discard such ereports.  An alternative may be to SERD them
 * on a per MC basis and trip if we see too many such events.
 */
event upset.memory.discard1@chip/core/strand;
prop upset.memory.discard1@chip/core/strand
    { !RESOURCE_EXISTS } (1)->
    ereport.cpu.amd.ic.inf_sys_ecc1@chip/core/strand,
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand,
    ereport.cpu.amd.bu.s_ecc1@chip/core/strand,
    ereport.cpu.amd.nb.mem_ce@chip/core/strand;

/* #DIMM_UE#
 * #PAGE_UE#
 * An uncorrectable multi-bit fault in a memory dimm can cause:
 *
 *  - mem_ue : reported by nb for an access from a remote cpu
 *  - inf_sys_eccm : reported by ic or dc; the ic does not report a syndrome
 *  - s_eccm : reported by bu
 *
 * Since on production systems we force HT Sync Flood on uncorrectable
 * memory errors (if not already set as such by the BIOS, as it should be)
 * we won't actually receive these ereports since the system will be reset.
 */

event ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.s_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.nb.mem_ue@chip/core/strand{within(5s)};

event fault.memory.dimm_ue@chip/memory-controller/dimm/rank;
event fault.memory.page_ue@chip/memory-controller/dimm/rank, message=0,
    response=0;

prop fault.memory.dimm_ue@chip/memory-controller/dimm/rank
    { CONTAINS_RANK } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ue@chip/core<>/strand<>;

prop fault.memory.page_ue@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ue@chip/core<>/strand<>;

event upset.memory.discard3@chip/core/strand;
prop upset.memory.discard3@chip/core/strand
    { !RESOURCE_EXISTS } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand,
    ereport.cpu.amd.bu.s_eccm@chip/core/strand,
    ereport.cpu.amd.nb.mem_ue@chip/core/strand;
/* #CSTESTFAIL#
 * Chip-select TestFail handling.
 *
 * If the BIOS fails a chip-select during POST, or perhaps after a
 * sync flood from an uncorrectable error, then on revision F and G it
 * should mark that chip-select as TestFail in the CS Base register.
 * When the memory-controller driver discovers all the MC configuration
 * it notes such failed chip-selects and creates topology nodes for the
 * chip-select and associated dimms and ranks, and produces an ereport for each
 * failed chip-select with detector set to the memory-controller node
 * and resource indicating the failed chip-select.
 *
 * Diagnosis is two-stage: the ereport is first narrowed to an
 * error.memory.cs_testfail on the chip-select named in its resource, and
 * that error is then attributed to every rank whose "csname" matches one
 * of the chip-select's dimm csname properties.
 */

event ereport.cpu.amd.mc.cs_testfail@chip/memory-controller{within(5s)};
event fault.memory.dimm_testfail@chip/memory-controller/dimm/rank;
event error.memory.cs_testfail@chip/memory-controller/chip-select;

/* True when the ereport's "resource" payload names this chip-select. */
#define	CONTAINS_CS \
    (payloadprop_contains("resource", \
    asru(chip/memory-controller/chip-select)))

prop error.memory.cs_testfail@chip/memory-controller/chip-select (1)->
    ereport.cpu.amd.mc.cs_testfail@chip/memory-controller { CONTAINS_CS };

/*
 * True when chip-select property 's' is defined and equal to the "csname"
 * property of the rank under consideration.
 */
#define	CSMATCH(s) \
    (confprop_defined(chip/memory-controller/chip-select, s) && \
    confprop(chip/memory-controller/chip-select, s) == \
    confprop(chip/memory-controller/dimm/rank, "csname"))

prop fault.memory.dimm_testfail@chip/memory-controller/dimm/rank (0)->
    error.memory.cs_testfail@chip/memory-controller/chip-select
    { CSMATCH("dimm1-csname") || CSMATCH("dimm2-csname") };

/* #ADDRPAR#
 * DRAM Command/Address Parity Errors.
 *
 *  - dramaddr_par : reported by the nb; the NB status register includes
 *    a bit indicating which dram controller channel (A or B) experienced
 *    the error.
 *
 * The constraint below tests mask 0x200 of the upper 32 bits of
 * IA32_MCi_STATUS and selects dram-channel instance 1 when it is set,
 * instance 0 otherwise.
 */

event ereport.cpu.amd.nb.dramaddr_par@chip/core/strand{within(5s)};
event fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel,
    response=0;

prop fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel[y] (0)->
    ereport.cpu.amd.nb.dramaddr_par@chip/core/strand
    { ((((payloadprop("IA32_MCi_STATUS") >> 32) & 0x200) ? 1 : 0) == y) };

/* #L2D_SINGLE#
 * A single bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_ecc1 : reported by ic on this cpu
 *  - inf_l2_ecc1 : reported by dc on this cpu
 *  - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu
 *
 * All three ereports feed one per-strand SERD engine; the l2cachedata
 * fault is diagnosed through that engine.
 */

#define	L2CACHEDATA_SB_COUNT	3
#define	L2CACHEDATA_SB_TIME	12h

event ereport.cpu.amd.ic.inf_l2_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.inf_l2_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.l2d_ecc1@chip/core/strand{within(5s)};

engine serd.cpu.amd.l2d_sb@chip/core/strand,
    N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME;

event fault.cpu.amd.l2cachedata@chip/core/strand,
    engine=serd.cpu.amd.l2d_sb@chip/core/strand;

prop fault.cpu.amd.l2cachedata@chip/core/strand (0)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/core/strand,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/core/strand,
    ereport.cpu.amd.bu.l2d_ecc1@chip/core/strand;
/* #L2D_MULTI#
 * A multi-bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_eccm : reported by ic on this cpu
 *  - inf_l2_eccm : reported by dc on this cpu
 *  - l2d_eccm : reported by bu on copyback or on snoop from another cpu
 *
 * These share the l2cachedata fault (and so its SERD engine) with the
 * single-bit case, but each event bumps the engine by
 * L2CACHEDATA_SB_COUNT + 1, i.e. by more than the engine's N.
 */

event ereport.cpu.amd.ic.inf_l2_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.inf_l2_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.l2d_eccm@chip/core/strand{within(5s)};

prop fault.cpu.amd.l2cachedata@chip/core/strand
    { setserdincrement(L2CACHEDATA_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/core/strand,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/core/strand,
    ereport.cpu.amd.bu.l2d_eccm@chip/core/strand;

/* #L2T_SINGLE#
 * A single bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_ecc1 : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 *
 * Both ereports are declared here, but only l2t_ecc1 is propagated with
 * the default SERD increment; l2t_par is propagated together with
 * l2t_eccm in the multi-bit section that follows.
 */

#define	L2CACHETAG_SB_COUNT	3
#define	L2CACHETAG_SB_TIME	12h

event ereport.cpu.amd.bu.l2t_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.l2t_par@chip/core/strand{within(5s)};

engine serd.cpu.amd.l2t_sb@chip/core/strand,
    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME;

event fault.cpu.amd.l2cachetag@chip/core/strand,
    engine=serd.cpu.amd.l2t_sb@chip/core/strand;

prop fault.cpu.amd.l2cachetag@chip/core/strand (0)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/core/strand;

/* #L2T_MULTI#
 * A multi-bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_eccm : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 *
 * Each such event bumps the l2cachetag SERD engine by
 * L2CACHETAG_SB_COUNT + 1, i.e. by more than the engine's N.
 */

event ereport.cpu.amd.bu.l2t_eccm@chip/core/strand{within(5s)};

prop fault.cpu.amd.l2cachetag@chip/core/strand
    { setserdincrement(L2CACHETAG_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.bu.l2t_eccm@chip/core/strand,
    ereport.cpu.amd.bu.l2t_par@chip/core/strand;

/* #ICD_PAR#
 * A data array parity fault in an I cache can cause:
 *
 *  - data_par : reported by ic on this cpu
 */

#define	ICACHEDATA_SB_COUNT	2
#define	ICACHEDATA_SB_TIME	168h

event ereport.cpu.amd.ic.data_par@chip/core/strand{within(5s)};

engine serd.cpu.amd.icachedata@chip/core/strand,
    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME;

event fault.cpu.amd.icachedata@chip/core/strand,
    engine=serd.cpu.amd.icachedata@chip/core/strand;

prop fault.cpu.amd.icachedata@chip/core/strand (0)->
    ereport.cpu.amd.ic.data_par@chip/core/strand;

/* #ICT_PAR#
 * A tag array parity fault in an I cache can cause:
 *
 *  - tag_par : reported by ic on this cpu
 */

#define	ICACHETAG_SB_COUNT	2
#define	ICACHETAG_SB_TIME	168h

event ereport.cpu.amd.ic.tag_par@chip/core/strand{within(5s)};
/*
 * SERD engine, fault and propagation for I cache tag array parity errors
 * (ic.tag_par), parameterised by ICACHETAG_SB_COUNT / ICACHETAG_SB_TIME.
 */
engine serd.cpu.amd.icachetag@chip/core/strand,
    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME;

event fault.cpu.amd.icachetag@chip/core/strand,
    engine=serd.cpu.amd.icachetag@chip/core/strand;

prop fault.cpu.amd.icachetag@chip/core/strand (0)->
    ereport.cpu.amd.ic.tag_par@chip/core/strand;

/* #ICT_SNOOP#
 * A snoop tag array parity fault in an I cache can cause:
 *
 *  - stag_par : reported by ic on this cpu
 *
 * No SERD engine is attached to this fault.
 */

event ereport.cpu.amd.ic.stag_par@chip/core/strand{within(5s)};
event fault.cpu.amd.icachestag@chip/core/strand;

prop fault.cpu.amd.icachestag@chip/core/strand (1)->
    ereport.cpu.amd.ic.stag_par@chip/core/strand;

/* #ICTLB_1#
 * An l1tlb parity fault in an I cache can cause:
 *
 *  - l1tlb_par : reported by ic on this cpu
 */

#define	ICACHEL1TLB_SB_COUNT	2
#define	ICACHEL1TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l1tlb_par@chip/core/strand{within(5s)};

engine serd.cpu.amd.l1itlb@chip/core/strand,
    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME;

event fault.cpu.amd.l1itlb@chip/core/strand,
    engine=serd.cpu.amd.l1itlb@chip/core/strand;

prop fault.cpu.amd.l1itlb@chip/core/strand (0)->
    ereport.cpu.amd.ic.l1tlb_par@chip/core/strand;

/* #ICTLB_2#
 * An l2tlb parity fault in an I cache can cause:
 *
 *  - l2tlb_par : reported by ic on this cpu
 */

#define	ICACHEL2TLB_SB_COUNT	2
5201414Scindi#define ICACHEL2TLB_SB_TIME 168h 5211414Scindi 5227532SSean.Ye@Sun.COMevent ereport.cpu.amd.ic.l2tlb_par@chip/core/strand{within(5s)}; 5237532SSean.Ye@Sun.COMengine serd.cpu.amd.l2itlb@chip/core/strand, 5247197Sstephh N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME; 5257532SSean.Ye@Sun.COMevent fault.cpu.amd.l2itlb@chip/core/strand, engine=serd.cpu.amd.l2itlb@chip/core/strand; 5261414Scindi 5277532SSean.Ye@Sun.COMprop fault.cpu.amd.l2itlb@chip/core/strand (0)-> 5287532SSean.Ye@Sun.COM ereport.cpu.amd.ic.l2tlb_par@chip/core/strand; 5291414Scindi 5301414Scindi/* #DCD_SINGLE# 5311414Scindi * A single bit data array fault in an D cache can cause: 5321414Scindi * 5331414Scindi * - data_ecc1 : reported by dc on this cpu by scrubber 5341414Scindi * - data_ecc1_uc : reported by dc on this cpu other than by scrubber 5351414Scindi * 5367197Sstephh * Make data_ecc1_uc fault immediately as it may have caused a panic, so 5377197Sstephh * it is handled by the multi-bit case in the following section. 
5381414Scindi */ 5391414Scindi 5407197Sstephh#define DCACHEDATA_SB_COUNT 2 5417197Sstephh#define DCACHEDATA_SB_TIME 168h 5427197Sstephh 5437532SSean.Ye@Sun.COMevent ereport.cpu.amd.dc.data_ecc1@chip/core/strand{within(5s)}; 5447532SSean.Ye@Sun.COMevent ereport.cpu.amd.dc.data_ecc1_uc@chip/core/strand{within(5s)}; 5457532SSean.Ye@Sun.COMengine serd.cpu.amd.dc_sb@chip/core/strand, 5467197Sstephh N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME; 5477532SSean.Ye@Sun.COMevent fault.cpu.amd.dcachedata@chip/core/strand, engine=serd.cpu.amd.dc_sb@chip/core/strand; 5481414Scindi 5497532SSean.Ye@Sun.COMprop fault.cpu.amd.dcachedata@chip/core/strand (0)-> 5507532SSean.Ye@Sun.COM ereport.cpu.amd.dc.data_ecc1@chip/core/strand; 5511414Scindi 5521414Scindi/* #DCD_MULTI# 5531414Scindi * A multi-bit data array fault in an D cache can cause: 5541414Scindi * 5551414Scindi * - data_eccm : reported by dc on this cpu 5561414Scindi */ 5571414Scindi 5587532SSean.Ye@Sun.COMevent ereport.cpu.amd.dc.data_eccm@chip/core/strand{within(5s)}; 5591414Scindi 5607532SSean.Ye@Sun.COMprop fault.cpu.amd.dcachedata@chip/core/strand 5617197Sstephh { setserdincrement(L2CACHETAG_SB_COUNT + 1) } (0)-> 5627532SSean.Ye@Sun.COM ereport.cpu.amd.dc.data_eccm@chip/core/strand, 5637532SSean.Ye@Sun.COM ereport.cpu.amd.dc.data_ecc1_uc@chip/core/strand; 5641414Scindi 5651414Scindi/* #DCT_PAR# 5661414Scindi * A tag array parity fault in an D cache can cause: 5671414Scindi * 5681414Scindi * - tag_par : reported by dc on this cpu 5691414Scindi */ 5701414Scindi 5717532SSean.Ye@Sun.COMevent ereport.cpu.amd.dc.tag_par@chip/core/strand{within(5s)}; 5727532SSean.Ye@Sun.COMevent fault.cpu.amd.dcachetag@chip/core/strand; 5731414Scindi 5747532SSean.Ye@Sun.COMprop fault.cpu.amd.dcachetag@chip/core/strand (1)-> 5757532SSean.Ye@Sun.COM ereport.cpu.amd.dc.tag_par@chip/core/strand; 5761414Scindi 5771414Scindi/* #DCT_SNOOP# 5781414Scindi * A snoop tag array parity fault in an D cache can cause: 5791414Scindi * 5801414Scindi * - stag_par 
: reported by dc on this cpu 5811414Scindi */ 5821414Scindi 5837532SSean.Ye@Sun.COMevent ereport.cpu.amd.dc.stag_par@chip/core/strand{within(5s)}; 5847532SSean.Ye@Sun.COMevent fault.cpu.amd.dcachestag@chip/core/strand; 5851414Scindi 5867532SSean.Ye@Sun.COMprop fault.cpu.amd.dcachestag@chip/core/strand (1)-> 5877532SSean.Ye@Sun.COM ereport.cpu.amd.dc.stag_par@chip/core/strand; 5881414Scindi 5891414Scindi/* #DCTLB_1# 5901414Scindi * An l1tlb parity fault in an D cache can cause: 5911414Scindi * 5921414Scindi * - l1tlb_par : reported by dc on this cpu 5931414Scindi */ 5941414Scindi 5957532SSean.Ye@Sun.COMevent ereport.cpu.amd.dc.l1tlb_par@chip/core/strand{within(5s)}; 5967532SSean.Ye@Sun.COMevent fault.cpu.amd.l1dtlb@chip/core/strand; 5971414Scindi 5987532SSean.Ye@Sun.COMprop fault.cpu.amd.l1dtlb@chip/core/strand (1)-> 5997532SSean.Ye@Sun.COM ereport.cpu.amd.dc.l1tlb_par@chip/core/strand; 6001414Scindi 6011414Scindi/* #DCTLB_2# 6021414Scindi * An l2tlb parity fault in an D cache can cause: 6031414Scindi * 6041414Scindi * - l2tlb_par : reported by dc on this cpu 6051414Scindi */ 6061414Scindi 6077532SSean.Ye@Sun.COMevent ereport.cpu.amd.dc.l2tlb_par@chip/core/strand{within(5s)}; 6087532SSean.Ye@Sun.COMevent fault.cpu.amd.l2dtlb@chip/core/strand; 6091414Scindi 6107532SSean.Ye@Sun.COMprop fault.cpu.amd.l2dtlb@chip/core/strand (1)-> 6117532SSean.Ye@Sun.COM ereport.cpu.amd.dc.l2tlb_par@chip/core/strand; 6121414Scindi 6132869Sgavinm/* #MISC# 6141414Scindi * Ereports that should not normally happen and which we will discard 6151414Scindi * without diagnosis if they do. 
These fall into a few categories:
 *
 *  - the corresponding detector is not enabled, typically because
 *    detection/handling of the event is taking place elsewhere
 *    (nb.ma, nb.ta, ls.s_rde, ic.rdde, bu.s_rde, nb.gart_walk)
 *  - the event is associated with a sync flood so even if the detector is
 *    enabled we will never handle the event and generate an ereport *and*
 *    even if the ereport did arrive we could perform no useful diagnosis
 *    e.g., the NB can be configured for sync flood on nb.mem_eccm
 *    but we don't choose to discard that ereport here since we could have
 *    made a useful diagnosis from it had it been delivered
 *    (nb.ht_sync, nb.ht_crc)
 *  - events that will be accompanied by an immediate panic and
 *    delivery of the ereport during subsequent reboot but from
 *    which no useful diagnosis can be made. (nb.rmw, nb.wdog)
 *
 * Ereports for all of these can be generated by error simulation and
 * injection.  We will perform a null diagnosis of all these ereports in order
 * to avoid "no subscription" complaints during test harness runs.
*/

/* Detector not enabled / event handled elsewhere. */
event ereport.cpu.amd.nb.ma@strand{within(5s)};
event ereport.cpu.amd.nb.ta@strand{within(5s)};
event ereport.cpu.amd.ls.s_rde@strand{within(5s)};
event ereport.cpu.amd.ic.rdde@strand{within(5s)};
event ereport.cpu.amd.bu.s_rde@strand{within(5s)};
event ereport.cpu.amd.nb.gart_walk@strand{within(5s)};

/* Associated with a sync flood. */
event ereport.cpu.amd.nb.ht_sync@strand{within(5s)};
event ereport.cpu.amd.nb.ht_crc@strand{within(5s)};

/* Accompanied by an immediate panic. */
event ereport.cpu.amd.nb.rmw@strand{within(5s)};
event ereport.cpu.amd.nb.wdog@strand{within(5s)};

/* Also routed to the null diagnosis. */
event ereport.cpu.amd.unknown@strand{within(5s)};

/* Single upset that subscribes to every ereport declared above. */
event upset.null_diag@strand;

prop upset.null_diag@strand
    (1)->
    ereport.cpu.amd.nb.ma@strand,
    ereport.cpu.amd.nb.ta@strand,
    ereport.cpu.amd.ls.s_rde@strand,
    ereport.cpu.amd.ic.rdde@strand,
    ereport.cpu.amd.bu.s_rde@strand,
    ereport.cpu.amd.nb.gart_walk@strand,
    ereport.cpu.amd.nb.ht_sync@strand,
    ereport.cpu.amd.nb.ht_crc@strand,
    ereport.cpu.amd.nb.rmw@strand,
    ereport.cpu.amd.nb.wdog@strand,
    ereport.cpu.amd.unknown@strand;