xref: /onnv-gate/usr/src/cmd/fm/eversholt/files/i386/i86pc/gcpu_amd.esc (revision 9078:5316b078e4eb)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Eversholt rules for generic AMD with on-chip memory-controller(s), as seen
 * in AMD family 0xf and 0x10.
 *
 * In the absence of any model-specific support, any memory errors that
 * are observed via MCA (typically through an on-chip memory-controller)
 * will surface as ereport.cpu.generic-x86.bus_interconnect_memory[_uc]
 * ereports and are diagnosed via generic rules in gcpu.esc.
 *
 * If full model-specific support is available, including full NorthBridge
 * support, then memory ereports will surface in a more-specific subclass
 * such as ereport.cpu.amd.mem_ce; these are diagnosed in amd64.esc.
 *
 * In the case where some "vendor generic" support is present, memory errors
 * are reported as ereport.cpu.generic-x86.mem_{ce,ue} and include a
 * syndrome and syndrome-type, and usually also a resource FMRI to identify
 * the affected resource.  In the AMD case a resource FMRI is included for
 * those chip versions that include an Online Spare Control register; this
 * register provides counts of ECC errors seen per channel and chip-select
 * on a NorthBridge node.  The resource FMRI has form
 * 	hc:///motherboard/chip/memory-controller/dram-channel/chip-select
 * in these cases.
 */

/*
 * Select "GMCA" as the dictionary for the diagnoses declared in this file.
 */
#pragma dictionary "GMCA"

/*
 * The number of pages that must be faulted on a chip-select for repeated
 * correctable errors before we will consider one of the component dimms
 * faulty.
 */
#define	CS_DIMMSB_THRESH	64

/*
 * The maximum number of pages we will diagnose as faulty on any one
 * chip-select (must be at least CS_DIMMSB_THRESH).  If a chip-select
 * has a fault that will affect zillions of pages this limit stops us
 * diagnosing excessive numbers of page faults.
 */
#define	CS_PAGEFLT_MAX		(2 * CS_DIMMSB_THRESH)

/*
 * SERD parameters for individual page faults.  When more than PAGE_SB_COUNT
 * correctable ereports are experienced on a single chip-select within
 * PAGE_SB_TIME the engine will fire and we will fault the most recent
 * page.
 */
#define	PAGE_SB_COUNT		3
#define	PAGE_SB_TIME		24h

/*
 * CSPATH is the path of a chip-select.  The engines, fault events and
 * propagations for memory faults in this file are all declared at this path.
 */
#define	CSPATH	chip/memory-controller/dram-channel/chip-select

/*
 * ADDR_VALID is true if the ereport payload includes IA32_MCi_ADDR.
 */
#define	ADDR_VALID (payloadprop_defined("IA32_MCi_ADDR"))

/*
 * CONTAINS_CS is true if the resource nvlist array exists and one of its
 * members matches the chip-select path.  This is used to constrain
 * propagations to those for which a resource element matches the
 * chip-select path of the propagation.  This is necessary because the
 * detector element of memory ereports is a cpu and not the chip-select itself.
 */
#define	CONTAINS_CS (payloadprop_contains("resource", asru(CSPATH)))

/*
 * SET_ADDR sets the "asru-physaddr" payload property to the value of the
 * IA32_MCi_ADDR member of the ereport payload.  It is used as the final
 * term of the page fault propagation constraints.
 */
#define	SET_ADDR (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))

/*
 * Generic memory ereports: correctable (mem_ce) and uncorrectable (mem_ue)
 * errors, both declared at the chip/core/strand path.
 */
event ereport.cpu.generic-x86.mem_ce@chip/core/strand { within(1s) };
event ereport.cpu.generic-x86.mem_ue@chip/core/strand { within(1s) };

/*
 *	 ========= Propagations for correctable memory faults ==========
 *	|								|
 *	| Discard mem_ce with no resource in the ereport payload.	|
 *	| Discard mem_ce with no address info - we can't fault the	|
 *	| corresponding page without it.				|
 *	|								|
 *	| For a mem_ce ereport detected by a given chip/cpu (as per	|
 *	| the payload detector info) whose resource payload member	|
 *	| includes a chip/memory-controller/dram-channel/chip-select	|
 *	| (CSPATH) for the same chip number, diagnose to a fault event	|
 *	| associated with a per-CSPATH SERD engine as long as we are	|
 *	| below the page fault limit for this CSPATH (defined below);	|
 *	| if we are over that limit then discard the event since we	|
 *	| will already have faulted a dimm and there is no point in	|
 *	| continuing to diagnose endless page faults from a dimm with	|
 *	| something like a pin failure.					|
 *	|								|
 *	| When the per-CSPATH SERD engine fires we fault the page	|
 *	| containing the address included in the ereport that caused	|
 *	| the trip, and increment a per-CSPATH counter to count page	|
 *	| faults on that chip-select from repeated correctable errors.	|
 *	|								|
 *	| A dimm_ce fault is diagnosed when we have faulted an		|
 *	| excessive number of page_ce faults on a chip-select - more	|
 *	| than CS_DIMMSB_THRESH.					|
 *	|===============================================================|
 */

/*
 * CS_PGFLT_LIMIT_REACHED is true once the stat.cepgflt counter for this
 * chip-select exceeds CS_PAGEFLT_MAX.
 */
#define	CS_PGFLT_LIMIT_REACHED (count(stat.cepgflt@CSPATH) > CS_PAGEFLT_MAX)
/*
 * CS_DIMMSB_THRESH_REACHED is true once the stat.cepgflt counter for this
 * chip-select has reached CS_DIMMSB_THRESH.
 */
#define	CS_DIMMSB_THRESH_REACHED \
	(count(stat.cepgflt@CSPATH) >= CS_DIMMSB_THRESH)

/*
 * Per-chip-select counter.  Fault events name it in their count= property,
 * and the CS_PGFLT_LIMIT_REACHED and CS_DIMMSB_THRESH_REACHED constraints
 * read it back with count().
 */
engine stat.cepgflt@CSPATH;
/*
 * Page fault from repeated correctable errors on a chip-select.  The fault
 * is tied to a per-chip-select SERD engine using PAGE_SB_COUNT and
 * PAGE_SB_TIME, and its diagnosis increments stat.cepgflt.
 */
engine serd.memory.generic-x86.page_ce@CSPATH, N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
event fault.memory.generic-x86.page_ce@CSPATH,
    message=0, response=0,		/* do not message individual pageflts */
    count=stat.cepgflt@CSPATH,		/* increment on pageflt diagnosis */
    engine=serd.memory.generic-x86.page_ce@CSPATH;
/*
 * DIMM fault from correctable errors on a chip-select, tied to its own
 * per-chip-select SERD engine.
 *
 * NOTE(review): this engine reuses the page SERD parameters (PAGE_SB_COUNT,
 * PAGE_SB_TIME) - confirm that is intended rather than dimm-specific values.
 */
engine serd.memory.generic-x86.dimm_ce@CSPATH, N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
event fault.memory.generic-x86.dimm_ce@CSPATH,
    engine=serd.memory.generic-x86.dimm_ce@CSPATH;

/*
 * Propagate a page_ce fault on a chip-select to mem_ce ereports at
 * chip/core<>/strand<>.  The constraint requires that the ereport carries
 * an address, that its resource array contains this chip-select, and that
 * the page fault limit for the chip-select has not been reached; SET_ADDR
 * then sets asru-physaddr from the ereport address.
 */
prop fault.memory.generic-x86.page_ce@CSPATH
    { ADDR_VALID && CONTAINS_CS && !CS_PGFLT_LIMIT_REACHED && SET_ADDR } (1)->
    ereport.cpu.generic-x86.mem_ce@chip/core<>/strand<>;

/*
 * Propagate a dimm_ce fault on a chip-select to mem_ce ereports at
 * chip/core<>/strand<>.  The constraint requires that the ereport carries
 * an address, that its resource array contains this chip-select, and that
 * the page fault count for the chip-select has reached CS_DIMMSB_THRESH.
 */
prop fault.memory.generic-x86.dimm_ce@CSPATH
    { ADDR_VALID && CONTAINS_CS && CS_DIMMSB_THRESH_REACHED } (1)->
    ereport.cpu.generic-x86.mem_ce@chip/core<>/strand<>;

/*
 * Discard mem_ce ereports that have no resource payload member or no
 * address: they are attributed to this upset.
 */
event upset.memory.generic-x86.discard@chip/core/strand;
prop upset.memory.generic-x86.discard@chip/core/strand
    { !payloadprop_defined("resource") || !ADDR_VALID } (1)->
    ereport.cpu.generic-x86.mem_ce@chip/core/strand;

/*
 *	 ========= Propagations for uncorrectable page faults ==========
 *	|								|
 *	| A UE produces an immediate page fault.			|
 *	|===============================================================|
 */

/*
 * Fault events for uncorrectable errors on a chip-select.  Neither is tied
 * to a SERD engine.
 *
 * NOTE(review): page_ue increments stat.cepgflt, the same counter that
 * page_ce increments and that the correctable-error thresholds read -
 * confirm that counting UE page faults there is intended.
 */
event fault.memory.generic-x86.page_ue@CSPATH,
    message=0, response=0,		/* do not message individual pageflts */
    count=stat.cepgflt@CSPATH;		/* increment on pageflt diagnosis */
event fault.memory.generic-x86.dimm_ue@CSPATH;

/*
 * Propagate a page_ue fault on a chip-select to mem_ue ereports at
 * chip/core<>/strand<>.  The constraint requires that the ereport carries
 * an address and that its resource array contains this chip-select;
 * SET_ADDR then sets asru-physaddr from the ereport address.
 */
prop fault.memory.generic-x86.page_ue@CSPATH
    { ADDR_VALID && CONTAINS_CS && SET_ADDR } (1)->
    ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>;

/*
 * Propagate a dimm_ue fault on a chip-select to mem_ue ereports at
 * chip/core<>/strand<>, under the same address and chip-select constraints
 * as page_ue but without setting asru-physaddr.
 */
prop fault.memory.generic-x86.dimm_ue@CSPATH
    { ADDR_VALID && CONTAINS_CS } (1)->
    ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>;

/*
 * Discard mem_ue ereports that have no resource payload member or no
 * address: they are attributed to this upset.
 *
 * NOTE(review): this upset is declared at CSPATH whereas the mem_ce discard
 * upset is declared at chip/core/strand - confirm the difference is
 * intended.
 */
event upset.memory.generic-x86.discard3@CSPATH;
prop upset.memory.generic-x86.discard3@CSPATH
    { !payloadprop_defined("resource") || !ADDR_VALID } (1)->
    ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>;

/*
 *	 ========= Propagations for GART Table Walk Errors =============
 *	|								|
 *	| These are usually due to software mis-programming of the GART	|
 *	| TLB rather than from hardware errors.  It would be incorrect	|
 *	| to fault and potentially offline a cpu in response to these	|
 *	| so they have their own fault class to facilitate us ignoring	|
 *	| them.								|
 *	|===============================================================|
 */

/*
 * GART table walk ereports are attributed, unconditionally, to an upset of
 * the same name at the same chip/core/strand path rather than to a fault.
 */
event ereport.cpu.generic-x86.gart_tbl_walk@chip/core/strand { within(1s) };
event upset.cpu.generic-x86.gart_tbl_walk@chip/core/strand;

prop upset.cpu.generic-x86.gart_tbl_walk@chip/core/strand (1)->
    ereport.cpu.generic-x86.gart_tbl_walk@chip/core/strand;