/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Eversholt rules for generic AMD with on-chip memory-controller(s), as seen
 * in AMD family 0xf and 0x10.
 *
 * In the absence of any model-specific support, any memory errors that
 * are observed via MCA (typically through an on-chip memory-controller)
 * will surface as ereport.cpu.generic-x86.bus_interconnect_memory[_uc]
 * ereports and are diagnosed via generic rules in gcpu.esc.
355254Sgavinm * 365254Sgavinm * If full model-specific support is available, including full NorthBridge 375254Sgavinm * support, then memory ereports will surface in a more-specific subclass 385254Sgavinm * such as ereport.cpu.amd.mem_ce; these are diagnosed in amd64.esc. 395254Sgavinm * 405254Sgavinm * In the case where some "vendor generic" support is present, memory errors 415254Sgavinm * are reported as ereport.cpu.generic-x86.mem_{ce,ue} and include a 425254Sgavinm * syndrome and syndrome-type, and usually also a resource FMRI to identify 435254Sgavinm * the affected resource. In the AMD case a resource FMRI is included for 445254Sgavinm * those chip versions that include an Online Spare Control register; this 455254Sgavinm * register provides counts of ECC errors seen per channel and chip-select 465254Sgavinm * on a NorthBridge node. The resource FMRI has form 475254Sgavinm * hc:///motherboard/chip/memory-controller/dram-channel/chip-select 485254Sgavinm * in these cases. 495254Sgavinm */ 505254Sgavinm 515254Sgavinm#pragma dictionary "GMCA" 525254Sgavinm 535254Sgavinm/* 545254Sgavinm * The number of pages that must be faulted on a chip-select for repeated 555254Sgavinm * correctable errors before we will consider one of the component dimms 565254Sgavinm * faulty. 575254Sgavinm */ 585254Sgavinm#define CS_DIMMSB_THRESH 64 595254Sgavinm 605254Sgavinm/* 615254Sgavinm * The maximum number of pages we will diagnose as faulty on any one 625254Sgavinm * chip-select (must be at least CS_PAGEFLT_THRESH). If a chip-select 635254Sgavinm * has a fault that will affect zillions of pages this limit stops us 645254Sgavinm * diagnosing excessive numbers of page faults. 655254Sgavinm */ 665254Sgavinm#define CS_PAGEFLT_MAX (2 * CS_DIMMSB_THRESH) 675254Sgavinm 685254Sgavinm/* 695254Sgavinm * SERD paramters for individual page faults. 
When more than PAGE_SB_COUNT 705254Sgavinm * correctable ereports are experienced on a single chip-select within 715254Sgavinm * PAGE_SB_TIME the engine will fire and we will fault the most recent 725254Sgavinm * page. 735254Sgavinm */ 745254Sgavinm#define PAGE_SB_COUNT 3 755254Sgavinm#define PAGE_SB_TIME 24h 765254Sgavinm 775254Sgavinm#define CSPATH chip/memory-controller/dram-channel/chip-select 785254Sgavinm 795254Sgavinm/* 805254Sgavinm * ADDR_VALID is true if the ereport payload includes IA32_MCi_ADDR. 815254Sgavinm */ 825254Sgavinm#define ADDR_VALID (payloadprop_defined("IA32_MCi_ADDR")) 835254Sgavinm 845254Sgavinm/* 855254Sgavinm * CONTAINS_CS is true if the resource nvlist array exists and one of its 865254Sgavinm * members matches the chip-select path. This is used to constrain 875254Sgavinm * propogations to those for which a resource element matches the 885254Sgavinm * chip-select path of the propogation. This is necessary because the 895254Sgavinm * detector element of memory ereports is a cpu and not the chip-select itself. 905254Sgavinm */ 915254Sgavinm#define CONTAINS_CS (payloadprop_contains("resource", asru(CSPATH))) 925254Sgavinm 935254Sgavinm#define SET_ADDR (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR"))) 945254Sgavinm/* Generic memory ereports. */ 957532SSean.Ye@Sun.COMevent ereport.cpu.generic-x86.mem_ce@chip/core/strand { within(1s) }; 967532SSean.Ye@Sun.COMevent ereport.cpu.generic-x86.mem_ue@chip/core/strand { within(1s) }; 975254Sgavinm 985254Sgavinm/* 997197Sstephh * ========= Propogations for correctable memory faults ========== 1005254Sgavinm * | | 1015254Sgavinm * | Discard mem_ce with no resource in the ereport payload. | 1025254Sgavinm * | Discard mem_ce with no address info - we can't fault the | 1035254Sgavinm * | corresponding page without it. 
 * |                                                               |
 * | For a mem_ce ereport detected by a given chip/cpu (as per     |
 * | the payload detector info) whose resource payload member      |
 * | includes a chip/memory-controller/dram-channel/chip-select    |
 * | (CSPATH) for the same chip number, diagnose to a fault event  |
 * | associated with a per-CSPATH SERD engine as long as we are    |
 * | below the page fault limit for this CSPATH (defined below);   |
 * | if we are over that limit then discard the event since we     |
 * | will already have faulted a dimm and there is no point in     |
 * | continuing to diagnose endless page faults from a dimm with   |
 * | something like a pin failure.                                 |
 * |                                                               |
 * | When the per-CSPATH SERD engine fires we fault the page       |
 * | containing the address included in the ereport that caused    |
 * | the trip, and increment a per-CSPATH counter to count page    |
 * | faults on that chip-select from repeated correctable errors.  |
 * |                                                               |
 * | A dimm_ce fault is diagnosed when we have faulted an          |
 * | excessive number of page_ce faults on a chip-select - at      |
 * | least CS_DIMMSB_THRESH.                                       |
| 1245254Sgavinm * |===============================================================| 1255254Sgavinm */ 1265254Sgavinm 1277197Sstephh#define CS_PGFLT_LIMIT_REACHED (count(stat.cepgflt@CSPATH) > CS_PAGEFLT_MAX) 1287197Sstephh#define CS_DIMMSB_THRESH_REACHED \ 1297197Sstephh (count(stat.cepgflt@CSPATH) >= CS_DIMMSB_THRESH) 1305254Sgavinm 1317197Sstephhengine stat.cepgflt@CSPATH; 1327197Sstephhengine serd.memory.generic-x86.page_ce@CSPATH, N=PAGE_SB_COUNT, T=PAGE_SB_TIME; 1335254Sgavinmevent fault.memory.generic-x86.page_ce@CSPATH, 1347197Sstephh message=0, response=0, /* do not message individual pageflts */ 1355254Sgavinm count=stat.cepgflt@CSPATH, /* increment on pageflt diagnosis */ 1365254Sgavinm engine=serd.memory.generic-x86.page_ce@CSPATH; 1377197Sstephhengine serd.memory.generic-x86.dimm_ce@CSPATH, N=PAGE_SB_COUNT, T=PAGE_SB_TIME; 1387197Sstephhevent fault.memory.generic-x86.dimm_ce@CSPATH, 1397197Sstephh engine=serd.memory.generic-x86.dimm_ce@CSPATH; 1405254Sgavinm 1417197Sstephhprop fault.memory.generic-x86.page_ce@CSPATH 1427197Sstephh { ADDR_VALID && CONTAINS_CS && !CS_PGFLT_LIMIT_REACHED && SET_ADDR } (1)-> 143*9078SStephen.Hanson@Sun.COM ereport.cpu.generic-x86.mem_ce@chip/core<>/strand<>; 1445254Sgavinm 1457197Sstephhprop fault.memory.generic-x86.dimm_ce@CSPATH 1467197Sstephh { ADDR_VALID && CONTAINS_CS && CS_DIMMSB_THRESH_REACHED } (1)-> 147*9078SStephen.Hanson@Sun.COM ereport.cpu.generic-x86.mem_ce@chip/core<>/strand<>; 1485254Sgavinm 1497532SSean.Ye@Sun.COMevent upset.memory.generic-x86.discard@chip/core/strand; 1507532SSean.Ye@Sun.COMprop upset.memory.generic-x86.discard@chip/core/strand 1515254Sgavinm { !payloadprop_defined("resource") || !ADDR_VALID } (1)-> 1527532SSean.Ye@Sun.COM ereport.cpu.generic-x86.mem_ce@chip/core/strand; 1535254Sgavinm 1545254Sgavinm/* 1557197Sstephh * ========= Propogations for uncorrectable page faults ========== 1565254Sgavinm * | | 1577197Sstephh * | A UE produces an immediate page fault. 
1585254Sgavinm * |===============================================================| 1595254Sgavinm */ 1605254Sgavinm 1617197Sstephhevent fault.memory.generic-x86.page_ue@CSPATH, 1627197Sstephh message=0, response=0, /* do not message individual pageflts */ 1637197Sstephh count=stat.cepgflt@CSPATH; /* increment on pageflt diagnosis */ 1647197Sstephhevent fault.memory.generic-x86.dimm_ue@CSPATH; 1655254Sgavinm 1667197Sstephhprop fault.memory.generic-x86.page_ue@CSPATH 1677197Sstephh { ADDR_VALID && CONTAINS_CS && SET_ADDR } (1)-> 168*9078SStephen.Hanson@Sun.COM ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>; 1695254Sgavinm 1707197Sstephhprop fault.memory.generic-x86.dimm_ue@CSPATH 1717197Sstephh { ADDR_VALID && CONTAINS_CS } (1)-> 172*9078SStephen.Hanson@Sun.COM ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>; 1735254Sgavinm 1745254Sgavinmevent upset.memory.generic-x86.discard3@CSPATH; 1755254Sgavinmprop upset.memory.generic-x86.discard3@CSPATH 1765254Sgavinm { !payloadprop_defined("resource") || !ADDR_VALID } (1)-> 177*9078SStephen.Hanson@Sun.COM ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>; 1785254Sgavinm 1795254Sgavinm/* 1805254Sgavinm * ========= Propogations for GART Table Walk Errors ============= 1815254Sgavinm * | | 1825254Sgavinm * | These are usually due to software mis-programming of the GART | 1835254Sgavinm * | TLB rather than from hardware errors. It would be incorrect | 1845254Sgavinm * | to fault and potentially offline a cpu in response to these | 1855254Sgavinm * | so they have their own fault class to facilitate us ignoring | 1865254Sgavinm * | them. 
| 1875254Sgavinm * |===============================================================| 1885254Sgavinm */ 1895254Sgavinm 1907532SSean.Ye@Sun.COMevent ereport.cpu.generic-x86.gart_tbl_walk@chip/core/strand { within(1s) }; 1917532SSean.Ye@Sun.COMevent upset.cpu.generic-x86.gart_tbl_walk@chip/core/strand; 1925254Sgavinm 1937532SSean.Ye@Sun.COMprop upset.cpu.generic-x86.gart_tbl_walk@chip/core/strand (1)-> 1947532SSean.Ye@Sun.COM ereport.cpu.generic-x86.gart_tbl_walk@chip/core/strand; 195