/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Name the "AMD" dictionary for this rule set. */
#pragma dictionary "AMD"

/*
 * Eversholt rules for the AMD Opteron CPU/Memory
 */

/*
 * Two-way maximum and minimum.  Each argument appears twice in the
 * expansion, so only pass expressions that are free of side effects.
 */
#define	MAX(x, y) ((x) >= (y) ? (x) : (y))
#define	MIN(x, y) ((x) <= (y) ? (x) : (y))

/*
 * SET_ADDR and SET_OFFSET are used to set a payload value in the fault that
 * we diagnose for page faults, to record the physical address of the faulting
 * page.
 *
 * SET_ADDR copies the payload member "IA32_MCi_ADDR" to "asru-physaddr";
 * SET_OFFSET copies "resource[0].hc-specific.offset" to "asru-offset".
 */
#define	SET_ADDR (setpayloadprop("asru-physaddr", \
	payloadprop("IA32_MCi_ADDR")))

#define	SET_OFFSET (setpayloadprop("asru-offset", \
	payloadprop("resource[0].hc-specific.offset")))

/*
 * RESOURCE_EXISTS is true if a member with name "resource" exists in the
 * payload - regardless of type (e.g., nvlist or nvlist array) or value.
 * The discard rules in this file test its negation.
 */
#define	RESOURCE_EXISTS	(payloadprop_defined("resource"))

/*
 * CONTAINS_RANK is true if the "resource" nvlist array (as used in memory
 * ereports) exists and one of its members matches the path for the
 * rank node, or the path for the dimm node that contains it.  Our memory
 * propagations are of the form
 *
 * "prop foo@chip/memory-controller/dimm/rank -> blah@chip/core/strand"
 *
 * since cpus detect memory errors;  in eversholt such a propagation, where
 * the lhs path and rhs path do not match, expands to the cross-product of
 * all dimms, ranks and cpus on the same chip (since chip appears in the
 * path on both sides).  We use CONTAINS_RANK to constrain the propagation
 * such that it only happens if the payload resource matches the rank.
 */
#define	CONTAINS_RANK (payloadprop_contains("resource", \
	asru(chip/memory-controller/dimm/rank)) \
	|| payloadprop_contains("resource", \
	asru(chip/memory-controller/dimm)))

/*
 * The following will tell us whether a syndrome that is known to be
 * correctable (from a mem_ce ereport) is single-bit or multi-bit.  For a
 * correctable ChipKill syndrome the number of bits set in the lowest
 * nibble indicates how many bits were in error.
 *
 * CBITMASK(synd)	the lowest nibble of the syndrome
 * CKSINGLE(synd)	true for a zero syndrome, or when exactly one bit of
 *			the lowest nibble is set
 * SINGLE_BIT_CE	true for syndrome-type "E", or for syndrome-type "C4"
 *			with a CKSINGLE syndrome
 * MULTI_BIT_CE		true for syndrome-type "C4" with a syndrome that is
 *			not CKSINGLE
 */

#define	CBITMASK(synd) ((synd) & 0xf)

#define	CKSINGLE(synd)							\
	((synd) == 0 ||							\
	(CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 ||		\
	CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8))

#define	SINGLE_BIT_CE							\
	(payloadprop("syndrome-type") == "E" ||				\
	(payloadprop("syndrome-type") == "C4" &&			\
	CKSINGLE(payloadprop("syndrome"))))

#define	MULTI_BIT_CE							\
	(payloadprop("syndrome-type") == "C4" &&			\
	!CKSINGLE(payloadprop("syndrome")))

/*								#PAGE#
 *								#DIMM_SCU#
 * A single bit fault in a memory rank can cause:
 *
 *  - mem_ce : reported by nb
 *  - inf_sys_ecc1: reported by ic or dc; inf_sys_ecc1 errors detected at the
 *    ic do not record a syndrome; these errors will not be triggered in
 *    ChipKill ECC mode (the NB corrects all ECC errors in that mode)
 *  - s_ecc1: reported by bu; this error will not be triggered in ChipKill
 *    ECC mode (the NB corrects all ECC in that mode)
 *
 * Single-bit errors are fed into a per-rank SERD engine; if a SERD engine
 * trips we diagnose a fault.memory.page_sb so that the response agent can
 * retire the page that caused the trip.  If the total number of pages
 * faulted in this way on a single rank exceeds a threshold we will
 * diagnose a fault.memory.dimm_sb against the containing dimm.
 *
 * Multibit ChipKill-correctable errors are treated identically to
 * single-bit errors (diagnosing fault.memory.page_ck and
 * fault.memory.dimm_ck), but via separate serd engines to allow distinct
 * parameters if desired.
 *
 * Uncorrectable errors produce an immediate page fault and corresponding
 * fault.memory.dimm_ue.
 *
 * Page faults are essentially internal - action is only required when
 * they are accompanied by a dimm fault.  As such we include message=0
 * on page faults.
 */

event ereport.cpu.amd.ic.inf_sys_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.s_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.nb.mem_ce@chip/core/strand{within(5s)};

/*
 * Single-bit correctable errors feed into per-rank
 * SERD engines which diagnose fault.memory.page_sb if they trip.
 *
 * Multi-bit correctable (via ChipKill) errors feed
 * into additional per-rank SERD engines which diagnose fault.memory.page_ck
 * if they trip.
 *
 * The number of fault.memory.page_sb and fault.memory.page_ck diagnosed is
 * counted in stat engines for each type.  These are used in deciding
 * whether to declare a dimm faulty after repeated page faults.
 *
 * The dimm_sb and dimm_ck SERD engines are declared with the same N and T
 * as the corresponding page engines.
 */

#define	PAGE_SB_COUNT		2
#define	PAGE_SB_TIME		72h
#define	PAGE_CK_COUNT		2
#define	PAGE_CK_TIME		72h

engine stat.sbpgflt@chip/memory-controller/dimm/rank;
engine stat.ckpgflt@chip/memory-controller/dimm/rank;
engine serd.memory.page_sb@chip/memory-controller/dimm/rank,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
engine serd.memory.page_ck@chip/memory-controller/dimm/rank,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
engine serd.memory.dimm_sb@chip/memory-controller/dimm/rank,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
engine serd.memory.dimm_ck@chip/memory-controller/dimm/rank,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
event fault.memory.page_sb@chip/memory-controller/dimm/rank, message=0,
    count=stat.sbpgflt@chip/memory-controller/dimm/rank, response=0,
    engine=serd.memory.page_sb@chip/memory-controller/dimm/rank;
event fault.memory.page_ck@chip/memory-controller/dimm/rank, message=0,
    count=stat.ckpgflt@chip/memory-controller/dimm/rank, response=0,
    engine=serd.memory.page_ck@chip/memory-controller/dimm/rank;
event fault.memory.dimm_sb@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_sb@chip/memory-controller/dimm/rank;
event fault.memory.dimm_ck@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_ck@chip/memory-controller/dimm/rank;

/*
 * The fraction of pages on a single rank that must be diagnosed as faulty
 * with single correctable unit faults before we will fault the rank.
 * Once we have faulted the rank we will continue to diagnose any further page
 * faults on the rank up to some maximum multiple of the threshold at which
 * we faulted the dimm.  This allows us to potentially contain some fairly
 * far-reaching but still limited-extent fault (such as a partial column
 * failure) without getting carried away and allowing a single faulty rank to
 * use up the entire system-imposed page retirement limit (which, once
 * reached, causes retirement requests to have no effect other than to fill
 * the fault manager cache and logs).
 *
 * This fraction is specified in basis points, where 100 basis points are
 * equivalent to 1 percent.  It is applied on a per-rank basis.
 *
 * The system imposes an absolute maximum on the number of pages it will
 * retire;  the current value is 10 basis points, or 0.1% of 'physmem'.  Note
 * that 'physmem' is reduced from installed memory pages by an amount
 * reflecting permanent kernel memory allocations.  This system page retire
 * limit bounds the maximum real response to page faults across all ranks
 * that fault manager response agents can effect, but it should not be confused
 * with any diagnosis threshold (i.e., the number of faulty pages we are
 * prepared to tolerate from a single rank before faulting the rank is
 * distinct from the total number of pages we are prepared to retire from use
 * in response to that and other faults).  It is, however, desirable to
 * arrange that the maximum number of pages we are prepared to fault from
 * any one rank is less than the system-wide quota.
 */
#define	PAGE_RETIRE_LIMIT_BPS	5		/* or 0.05%; ~ 131 pages/GB */

/*
 * A macro to manipulate the above fraction.  Given a size in bytes convert
 * this to pages (4K pagesize) and calculate the number of those pages
 * indicated by PAGE_RETIRE_LIMIT_BPS basis points.  All arithmetic is
 * integer, so both divisions truncate.
 */
#define	_BPS_PGCNT(totalbytes) \
	((((totalbytes) / 4096) * PAGE_RETIRE_LIMIT_BPS) / 10000)

/*
 * The single-correctable-unit threshold: the number of faulted pages on a
 * rank at which we fault the rank.  It is derived from the rank's "size"
 * configuration property via _BPS_PGCNT, and we insist that it be at least
 * 128 and never more than 512.
 */
#define	RANK_THRESH MIN(512, MAX(128, \
	_BPS_PGCNT(confprop(chip/memory-controller/dimm/rank, "size"))))

/*
 * The maximum number of single-correctable-unit page faults we will diagnose
 * on a single rank (must be greater than RANK_THRESH).  We set
 * this at twice the rank fault threshold.
 */
#define	RANK_PGFLT_MAX (2 * RANK_THRESH)

/*
 * Current values of the per-rank stat engines that count diagnosed
 * fault.memory.page_sb and fault.memory.page_ck faults respectively.
 */
#define	SB_PGFLTS (count(stat.sbpgflt@chip/memory-controller/dimm/rank))
#define	CK_PGFLTS (count(stat.ckpgflt@chip/memory-controller/dimm/rank))

/*
 * "Single-correctable-unit" DIMM faults are diagnosed when the total number of
 * page faults (diagnosed from repeated single-bit or multibit-chipkills)
 * from any one rank on that DIMM exceeds a threshold (the dimm props below
 * compare with '>').  A "correctable unit"
 * is a single bit in normal 64/8 ECC mode, or a single symbol in ChipKill
 * 128/16 mode (i.e., nibble-aligned nibble for the code used on Opteron).
 *
 * We do not stop diagnosing further single-bit page faults once we have
 * declared a single-bit DIMM fault - we continue diagnosing them and
 * response agents can continue to retire those pages up to the system-imposed
 * retirement limit.
 *
 * Two distinct fault types may be diagnosed - fault.memory.dimm_sb and
 * fault.memory.dimm_ck.  Which one is diagnosed depends on whether we
 * have reached the threshold for a majority of single-bit page faults or
 * multibit page faults.
 *
 * Implementation: we maintain parallel SERD engines to the page_sb and
 * page_ck engines, which trip in unison.  On trip it generates a distinct
 * ereport which we diagnose to a fault if the threshold has been reached.
 */

/*
 * Single-bit page fault: the ereport must name this rank, carry a
 * single-bit syndrome, and the rank must still be below RANK_PGFLT_MAX
 * diagnosed page faults.  SET_ADDR and SET_OFFSET are evaluated last so the
 * fault payload is only filled in once the other tests have passed.
 */
prop fault.memory.page_sb@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SINGLE_BIT_CE &&
      SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;

/*
 * ChipKill page fault: same shape as the page_sb rule, but for ereports
 * that are not SINGLE_BIT_CE.
 *
 * NOTE(review): the guard is !SINGLE_BIT_CE rather than MULTI_BIT_CE, so an
 * ereport whose syndrome-type is neither "E" nor "C4" is also accepted
 * here - confirm that this is intended.
 */
prop fault.memory.page_ck@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && !SINGLE_BIT_CE &&
      SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;

/*
 * Single-bit dimm fault: requires that the combined page fault count for
 * the rank exceeds RANK_THRESH and that more than half of RANK_THRESH of
 * those were single-bit page faults.
 */
prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
      SB_PGFLTS > RANK_THRESH / 2 } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;

/*
 * ChipKill dimm fault: requires that the combined page fault count for the
 * rank exceeds RANK_THRESH and that more than half of RANK_THRESH of those
 * were ChipKill page faults.  Unlike dimm_sb this propagates only to the
 * nb.mem_ce ereport.
 */
prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
      CK_PGFLTS > RANK_THRESH / 2 } (1)->
    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;

/*
 * If the address is not valid then no resource member will be included
 * in a nb.mem_ce or nb.mem_ue ereport.  These cases should be rare.
 * We will also discard all inf_sys_ecc1 events detected at the ic since they
 * have no syndrome and therefore no resource information.
 * We will discard such ereports.  An alternative may be to SERD them
 * on a per MC basis and trip if we see too many such events.
 *
 * This rule covers the correctable ereports; it applies only when the
 * payload has no "resource" member at all.
 */
event upset.memory.discard1@chip/core/strand;
prop upset.memory.discard1@chip/core/strand
    { !RESOURCE_EXISTS } (1)->
    ereport.cpu.amd.ic.inf_sys_ecc1@chip/core/strand,
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand,
    ereport.cpu.amd.bu.s_ecc1@chip/core/strand,
    ereport.cpu.amd.nb.mem_ce@chip/core/strand;

/*								#DIMM_UE#
 *								#PAGE_UE#
 * An uncorrectable multi-bit fault in a memory dimm can cause:
 *
 *  - mem_ue	   : reported by nb for an access from a remote cpu
 *  - inf_sys_eccm : reported by ic or dc; the ic does not report a syndrome
 *  - s_eccm	   : reported by bu
 *
 * Since on production systems we force HT Sync Flood on uncorrectable
 * memory errors (if not already set as such by the BIOS, as it should be)
 * we won't actually receive these ereports since the system will be reset.
 */

event ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.s_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.nb.mem_ue@chip/core/strand{within(5s)};

/*
 * Neither fault has a SERD engine attached.  As with the correctable page
 * faults, page_ue is declared with message=0 and response=0.
 */
event fault.memory.dimm_ue@chip/memory-controller/dimm/rank;
event fault.memory.page_ue@chip/memory-controller/dimm/rank, message=0,
    response=0;

/*
 * Uncorrectable dimm fault: any of the four uncorrectable ereports whose
 * resource payload names this rank (or its dimm).
 */
prop fault.memory.dimm_ue@chip/memory-controller/dimm/rank
    { CONTAINS_RANK } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ue@chip/core<>/strand<>;

/*
 * Uncorrectable page fault: same ereports and rank match as dimm_ue, and
 * additionally records the physical address and offset in the fault payload.
 */
prop fault.memory.page_ue@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.bu.s_eccm@chip/core<>/strand<>,
    ereport.cpu.amd.nb.mem_ue@chip/core<>/strand<>;

/*
 * Discard uncorrectable memory ereports that carry no "resource" member,
 * mirroring upset.memory.discard1 for the correctable ereports.
 */
event upset.memory.discard3@chip/core/strand;
prop upset.memory.discard3@chip/core/strand
    { !RESOURCE_EXISTS } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand,
    ereport.cpu.amd.bu.s_eccm@chip/core/strand,
    ereport.cpu.amd.nb.mem_ue@chip/core/strand;

/*								#CSTESTFAIL#
 * If the BIOS fails a chip-select during POST, or perhaps after a
 * sync flood from an uncorrectable error, then on revision F and G it
 * should mark that chip-select as TestFail in the CS Base register.
 * When the memory-controller driver discovers all the MC configuration
 * it notes such failed chip-selects and creates topology nodes for the
 * chip-select and associated dimms and ranks, and produces an ereport for each
 * failed chip-select with detector set to the memory-controller node
 * and resource indicating the failed chip-select.
 */

event ereport.cpu.amd.mc.cs_testfail@chip/memory-controller{within(5s)};
event fault.memory.dimm_testfail@chip/memory-controller/dimm/rank;
event error.memory.cs_testfail@chip/memory-controller/chip-select;

/*
 * CONTAINS_CS is true if the "resource" payload member contains the path
 * of this chip-select node.
 */
#define	CONTAINS_CS (payloadprop_contains("resource", \
	asru(chip/memory-controller/chip-select)))

/*
 * Attribute the ereport, which is detected at the memory-controller, to the
 * chip-select named in its resource.
 */
prop error.memory.cs_testfail@chip/memory-controller/chip-select (1)->
    ereport.cpu.amd.mc.cs_testfail@chip/memory-controller
    { CONTAINS_CS };

/*
 * CSMATCH(s) is true if the chip-select has a configuration property named
 * s and its value equals the "csname" property of the rank.
 */
#define	CSMATCH(s) \
	(confprop_defined(chip/memory-controller/chip-select, s) && \
	confprop(chip/memory-controller/chip-select, s) == \
	confprop(chip/memory-controller/dimm/rank, "csname"))

/*
 * Fault each rank whose csname matches either the "dimm1-csname" or the
 * "dimm2-csname" property of the failed chip-select.
 */
prop fault.memory.dimm_testfail@chip/memory-controller/dimm/rank (0)->
    error.memory.cs_testfail@chip/memory-controller/chip-select
    { CSMATCH("dimm1-csname") || CSMATCH("dimm2-csname")};

/*								#ADDRPAR#
 * DRAM Command/Address Parity Errors.
 *
 *  - dramaddr_par : reported by the nb; the NB status register includes
 *    a bit indicating which dram controller channel (A or B) experienced
 *    the error.
 *
 * The rule below shifts IA32_MCi_STATUS right by 32 and tests mask 0x200;
 * the result (0 or 1) must equal the dram-channel instance number y.
 */

event ereport.cpu.amd.nb.dramaddr_par@chip/core/strand{within(5s)};
event fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel, response=0;

prop fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel[y] (0)->
    ereport.cpu.amd.nb.dramaddr_par@chip/core/strand {
    (((payloadprop("IA32_MCi_STATUS") >> 32) & 0x200) ? 1 : 0) == y };

/*								#L2D_SINGLE#
 * A single bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_ecc1 : reported by ic on this cpu
 *  - inf_l2_ecc1 : reported by dc on this cpu
 *  - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu
 *
 * These feed the per-strand serd.cpu.amd.l2d_sb engine, which is attached
 * to fault.cpu.amd.l2cachedata.
 */

#define	L2CACHEDATA_SB_COUNT	3
#define	L2CACHEDATA_SB_TIME	12h

event ereport.cpu.amd.ic.inf_l2_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.inf_l2_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.l2d_ecc1@chip/core/strand{within(5s)};
engine serd.cpu.amd.l2d_sb@chip/core/strand,
    N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME;
event fault.cpu.amd.l2cachedata@chip/core/strand,
    engine=serd.cpu.amd.l2d_sb@chip/core/strand;

prop fault.cpu.amd.l2cachedata@chip/core/strand (0)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/core/strand,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/core/strand,
    ereport.cpu.amd.bu.l2d_ecc1@chip/core/strand;

/*								#L2D_MULTI#
 * A multi-bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_eccm : reported by ic on this cpu
 *  - inf_l2_eccm : reported by dc on this cpu
 *  - l2d_eccm : reported by bu on copyback or on snoop from another cpu
 *
 * These share the single-bit fault and SERD engine, but set the SERD
 * increment to one more than the engine's N (presumably so that a single
 * multi-bit ereport trips the engine at once).
 */

event ereport.cpu.amd.ic.inf_l2_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.inf_l2_eccm@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.l2d_eccm@chip/core/strand{within(5s)};

prop fault.cpu.amd.l2cachedata@chip/core/strand
    { setserdincrement(L2CACHEDATA_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/core/strand,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/core/strand,
    ereport.cpu.amd.bu.l2d_eccm@chip/core/strand;

/*								#L2T_SINGLE#
 * A single bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_ecc1 : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 *
 * Only l2t_ecc1 is propagated here; l2t_par is declared in this section but
 * is propagated by the multi-bit rule in the L2T_MULTI section.
 */

#define	L2CACHETAG_SB_COUNT	3
#define	L2CACHETAG_SB_TIME	12h

event ereport.cpu.amd.bu.l2t_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.l2t_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.l2t_sb@chip/core/strand,
    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME;
event fault.cpu.amd.l2cachetag@chip/core/strand,
    engine=serd.cpu.amd.l2t_sb@chip/core/strand;

prop fault.cpu.amd.l2cachetag@chip/core/strand (0)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/core/strand;

/*								#L2T_MULTI#
 * A multi-bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_eccm : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 *
 * Both ereports set the SERD increment to one more than the l2t_sb engine's
 * N (presumably so that a single ereport trips the engine at once).
 */

event ereport.cpu.amd.bu.l2t_eccm@chip/core/strand{within(5s)};

prop fault.cpu.amd.l2cachetag@chip/core/strand
    { setserdincrement(L2CACHETAG_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.bu.l2t_eccm@chip/core/strand,
    ereport.cpu.amd.bu.l2t_par@chip/core/strand;

/*								#ICD_PAR#
 * A data array parity fault in an I cache can cause:
 *
 *  - data_par : reported by ic on this cpu
 *
 * Counted by the per-strand serd.cpu.amd.icachedata engine.
 */

#define	ICACHEDATA_SB_COUNT	2
#define	ICACHEDATA_SB_TIME	168h

event ereport.cpu.amd.ic.data_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.icachedata@chip/core/strand,
    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME;
event fault.cpu.amd.icachedata@chip/core/strand,
    engine=serd.cpu.amd.icachedata@chip/core/strand;

prop fault.cpu.amd.icachedata@chip/core/strand (0)->
    ereport.cpu.amd.ic.data_par@chip/core/strand;

/*								#ICT_PAR#
 * A tag array parity fault in an I cache can cause:
 *
 *  - tag_par : reported by ic on this cpu
 *
 * Counted by the per-strand serd.cpu.amd.icachetag engine.
 */

#define	ICACHETAG_SB_COUNT	2
#define	ICACHETAG_SB_TIME	168h

event ereport.cpu.amd.ic.tag_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.icachetag@chip/core/strand,
    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME;
event fault.cpu.amd.icachetag@chip/core/strand,
    engine=serd.cpu.amd.icachetag@chip/core/strand;

prop fault.cpu.amd.icachetag@chip/core/strand (0)->
    ereport.cpu.amd.ic.tag_par@chip/core/strand;

/*								#ICT_SNOOP#
 * A snoop tag array parity fault in an I cache can cause:
 *
 *  - stag_par : reported by ic on this cpu
 *
 * The fault is declared without a SERD engine.
 */

event ereport.cpu.amd.ic.stag_par@chip/core/strand{within(5s)};
event fault.cpu.amd.icachestag@chip/core/strand;

prop fault.cpu.amd.icachestag@chip/core/strand (1)->
    ereport.cpu.amd.ic.stag_par@chip/core/strand;

/*								#ICTLB_1#
 * An l1tlb parity fault in an I cache can cause:
 *
 *  - l1tlb_par : reported by ic on this cpu
 *
 * Counted by the per-strand serd.cpu.amd.l1itlb engine.
 */

#define	ICACHEL1TLB_SB_COUNT	2
#define	ICACHEL1TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l1tlb_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.l1itlb@chip/core/strand,
    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME;
event fault.cpu.amd.l1itlb@chip/core/strand,
    engine=serd.cpu.amd.l1itlb@chip/core/strand;

prop fault.cpu.amd.l1itlb@chip/core/strand (0)->
    ereport.cpu.amd.ic.l1tlb_par@chip/core/strand;

/*								#ICTLB_2#
 * An l2tlb parity fault in an I cache can cause:
 *
 *  - l2tlb_par : reported by ic on this cpu
 *
 * Counted by the per-strand serd.cpu.amd.l2itlb engine.
 */

#define	ICACHEL2TLB_SB_COUNT	2
#define	ICACHEL2TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l2tlb_par@chip/core/strand{within(5s)};
engine serd.cpu.amd.l2itlb@chip/core/strand,
    N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME;
event fault.cpu.amd.l2itlb@chip/core/strand,
    engine=serd.cpu.amd.l2itlb@chip/core/strand;

prop fault.cpu.amd.l2itlb@chip/core/strand (0)->
    ereport.cpu.amd.ic.l2tlb_par@chip/core/strand;

/*								#DCD_SINGLE#
 * A single bit data array fault in a D cache can cause:
 *
 *  - data_ecc1 : reported by dc on this cpu by scrubber
 *  - data_ecc1_uc : reported by dc on this cpu other than by scrubber
 *
 * Make data_ecc1_uc fault immediately as it may have caused a panic, so
 * it is handled by the multi-bit case in the following section.
 */

#define	DCACHEDATA_SB_COUNT	2
#define	DCACHEDATA_SB_TIME	168h

event ereport.cpu.amd.dc.data_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.data_ecc1_uc@chip/core/strand{within(5s)};
engine serd.cpu.amd.dc_sb@chip/core/strand,
    N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME;
event fault.cpu.amd.dcachedata@chip/core/strand,
    engine=serd.cpu.amd.dc_sb@chip/core/strand;

prop fault.cpu.amd.dcachedata@chip/core/strand (0)->
    ereport.cpu.amd.dc.data_ecc1@chip/core/strand;

/*								#DCD_MULTI#
 * A multi-bit data array fault in a D cache can cause:
 *
 *  - data_eccm : reported by dc on this cpu
 *
 * data_ecc1_uc is handled here as well (see the DCD_SINGLE section).  The
 * SERD increment is one more than the N of the serd.cpu.amd.dc_sb engine
 * attached to fault.cpu.amd.dcachedata, so it is expressed in terms of
 * DCACHEDATA_SB_COUNT rather than another engine's count.
 */

event ereport.cpu.amd.dc.data_eccm@chip/core/strand{within(5s)};

prop fault.cpu.amd.dcachedata@chip/core/strand
    { setserdincrement(DCACHEDATA_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.dc.data_eccm@chip/core/strand,
    ereport.cpu.amd.dc.data_ecc1_uc@chip/core/strand;

/*								#DCT_PAR#
 * A tag array parity fault in a D cache can cause:
 *
 *  - tag_par : reported by dc on this cpu
 *
 * The fault is declared without a SERD engine.
 */

event ereport.cpu.amd.dc.tag_par@chip/core/strand{within(5s)};
event fault.cpu.amd.dcachetag@chip/core/strand;

prop fault.cpu.amd.dcachetag@chip/core/strand (1)->
    ereport.cpu.amd.dc.tag_par@chip/core/strand;

/*								#DCT_SNOOP#
 * A snoop tag array parity fault in a D cache can cause:
 *
 *  - stag_par : reported by dc on this cpu
 *
 * The fault is declared without a SERD engine.
 */

event fault.cpu.amd.dcachestag@chip/core/strand;
event ereport.cpu.amd.dc.stag_par@chip/core/strand{within(5s)};

prop fault.cpu.amd.dcachestag@chip/core/strand
    (1)-> ereport.cpu.amd.dc.stag_par@chip/core/strand;
5881414Scindi
/*								#DCTLB_1#
 * An l1tlb parity fault in a D cache can cause:
 *
 *  - l1tlb_par : reported by dc on this cpu
 *
 * The fault is declared without a SERD engine.
 */

event fault.cpu.amd.l1dtlb@chip/core/strand;
event ereport.cpu.amd.dc.l1tlb_par@chip/core/strand{within(5s)};

prop fault.cpu.amd.l1dtlb@chip/core/strand
    (1)-> ereport.cpu.amd.dc.l1tlb_par@chip/core/strand;
6001414Scindi
/*								#DCTLB_2#
 * An l2tlb parity fault in a D cache can cause:
 *
 *  - l2tlb_par : reported by dc on this cpu
 *
 * The fault is declared without a SERD engine.
 */

event fault.cpu.amd.l2dtlb@chip/core/strand;
event ereport.cpu.amd.dc.l2tlb_par@chip/core/strand{within(5s)};

prop fault.cpu.amd.l2dtlb@chip/core/strand
    (1)-> ereport.cpu.amd.dc.l2tlb_par@chip/core/strand;
6121414Scindi
/*								#MISC#
 * Ereports that should not normally happen and which we will discard
 * without diagnosis if they do.  These fall into a few categories:
 *
 *	- the corresponding detector is not enabled, typically because
 *	  detection/handling of the event is taking place elsewhere
 *	  (nb.ma, nb.ta, ls.s_rde, ic.rdde, bu.s_rde, nb.gart_walk)
 *	- the event is associated with a sync flood so even if the detector is
 *	  enabled we will never handle the event and generate an ereport *and*
 *	  even if the ereport did arrive we could perform no useful diagnosis
 *	  e.g., the NB can be configured for sync flood on nb.mem_eccm
 *	  but we don't choose to discard that ereport here since we could have
 *	  made a useful diagnosis from it had it been delivered
 *	  (nb.ht_sync, nb.ht_crc)
 *	- events that will be accompanied by an immediate panic and
 *	  delivery of the ereport during subsequent reboot but from
 *	  which no useful diagnosis can be made. (nb.rmw, nb.wdog)
 *
 * Ereports for all of these can be generated by error simulation and
 * injection.  We will perform a null diagnosis of all these ereports in order
 * to avoid "no subscription" complaints during test harness runs.
 *
 * ereport.cpu.amd.unknown is subscribed to the same null diagnosis.
 */

event upset.null_diag@strand;

/* Detector not enabled; handled elsewhere. */
event ereport.cpu.amd.nb.ma@strand{within(5s)};
event ereport.cpu.amd.nb.ta@strand{within(5s)};
event ereport.cpu.amd.ls.s_rde@strand{within(5s)};
event ereport.cpu.amd.ic.rdde@strand{within(5s)};
event ereport.cpu.amd.bu.s_rde@strand{within(5s)};
event ereport.cpu.amd.nb.gart_walk@strand{within(5s)};

/* Associated with a sync flood. */
event ereport.cpu.amd.nb.ht_sync@strand{within(5s)};
event ereport.cpu.amd.nb.ht_crc@strand{within(5s)};

/* Accompanied by an immediate panic. */
event ereport.cpu.amd.nb.rmw@strand{within(5s)};
event ereport.cpu.amd.nb.wdog@strand{within(5s)};

/* Not listed in any of the categories above. */
event ereport.cpu.amd.unknown@strand{within(5s)};

prop upset.null_diag@strand (1)->
    ereport.cpu.amd.nb.ma@strand, ereport.cpu.amd.nb.ta@strand,
    ereport.cpu.amd.ls.s_rde@strand, ereport.cpu.amd.ic.rdde@strand,
    ereport.cpu.amd.bu.s_rde@strand, ereport.cpu.amd.nb.gart_walk@strand,
    ereport.cpu.amd.nb.ht_sync@strand, ereport.cpu.amd.nb.ht_crc@strand,
    ereport.cpu.amd.nb.rmw@strand, ereport.cpu.amd.nb.wdog@strand,
    ereport.cpu.amd.unknown@strand;
662