/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#pragma dictionary "DISK"

/*
 * P is the path component that every event, engine and propagation in this
 * file is anchored to (written as "@P" below).
 */
#define	P	disk

fru P;
asru P;

/*
 * Overall comments for this file:
 * <disk-as-detector> The disk-as-detector DE provides the mapping between
 * ereports generated by a kernel disk driver sd(7D) and resulting faults.
 */

/*
 * SERD engine for media error fault propagation:
 *
 * This strategy is designed to give a file system, like ZFS, the
 * ability to attempt data recovery/relocation without faulting a disk.
 * This implementation depends on a file system retry to the same lba
 * to trigger a fault when recovery/relocation is not possible.
 *
 * We let the engine propagate one error only once every 1 minute and then
 * if we still get 2 or more errors within 24 hours for the same LBA, there
 * is a fault.
 */
engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;

/*
 * disk-as-detector: fault events.
 */
event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
    engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;

/*
 * The uderr fault will be defined at some future time.
 * event fault.io.scsi.cmd.disk.dev.uderr@P;
 */

/*
 * disk-as-detector: upset events.
 * NOTE: For now we define an upset to implement discard.
 */
event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
event upset.io.scsi.cmd.disk.dev.uderr@P;
event upset.io.scsi.cmd.disk.dev.serr@P;
event upset.io.scsi.cmd.disk.tran@P;
event upset.io.scsi.cmd.disk.recovered@P;

/*
 * disk-as-detector: ereports from the kernel.
 *
 * We don't know the topology for all scsi disks, but the kernel will always
 * generate ereport telemetry assuming that we do. We define these ereports
 * with 'discard_if_config_unknown=1', which permits ereports against things
 * with unknown topology to be silently discarded. The ereport data is logged
 * in either case, and can be viewed via 'fmdump -eV'.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;

/*
 * For some ereports we let the 'driver-assessment', communicated as part of
 * the ereport payload, determine fault vs. upset via propagation constraints.
 * NONFATAL is defined as the exact negation of FATAL, so for a given ereport
 * exactly one of the two constraints holds.
 */
#define	DRIVER_ASSESSMENT_FATAL \
	(payloadprop_contains("driver-assessment", "fatal"))
#define	DRIVER_ASSESSMENT_NONFATAL (!DRIVER_ASSESSMENT_FATAL)

/*
 * disk-as-detector: propagations from faults (based on
 * DRIVER_ASSESSMENT_FATAL).
 * We need to set additional fault payloads to indicate fault details.
 * The payloads we may need are listed below:
 * fault.io.scsi.cmd.disk.dev.rqs.derr
 *	op_code, key, asc, ascq
 * fault.io.scsi.cmd.disk.dev.rqs.merr
 *	op_code, key, asc, ascq, lba
 */
prop fault.io.scsi.cmd.disk.dev.rqs.derr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P { DRIVER_ASSESSMENT_FATAL &&
	setpayloadprop("key", payloadprop("key")) &&
	setpayloadprop("asc", payloadprop("asc")) &&
	setpayloadprop("ascq", payloadprop("ascq")) };

/*
 * Utilize setserdsuffix with specific LBA,
 * the serd engine would only trigger if the fault recurred on the same LBA
 */
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P { DRIVER_ASSESSMENT_FATAL &&
	setserdsuffix(payloadprop("lba")) &&
	setpayloadprop("key", payloadprop("key")) &&
	setpayloadprop("asc", payloadprop("asc")) &&
	setpayloadprop("ascq", payloadprop("ascq")) &&
	setpayloadprop("lba", payloadprop("lba")) };

/*
 * NOTE: this propagation uses the "may" propagation of eversholt.
 * The ereport need never exist. It's just a way of making
 * the diagnosis wait for the within time on that ereport
 * to complete. Once it has completed the diagnosis continues
 * even though the dummy ereport didn't occur.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P { within(60s) };
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;

/*
 * The uderr fault will be propagated at some future time.
 * prop fault.io.scsi.cmd.disk.dev.uderr@P->
 *	ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
 */

/*
 * disk-as-detector: propagations from upsets (based on
 * DRIVER_ASSESSMENT_NONFATAL).
 */
prop upset.io.scsi.cmd.disk.dev.rqs.derr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P { DRIVER_ASSESSMENT_NONFATAL };

prop upset.io.scsi.cmd.disk.dev.rqs.merr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P { DRIVER_ASSESSMENT_NONFATAL };

/*
 * disk-as-detector: propagations from upsets (independent of
 * driver-assessment)
 */

prop upset.io.scsi.cmd.disk.dev.serr@P ->
    ereport.io.scsi.cmd.disk.dev.serr@P;

prop upset.io.scsi.cmd.disk.dev.uderr@P ->
    ereport.io.scsi.cmd.disk.dev.uderr@P;

prop upset.io.scsi.cmd.disk.recovered@P ->
    ereport.io.scsi.cmd.disk.recovered@P;

prop upset.io.scsi.cmd.disk.tran@P ->
    ereport.io.scsi.cmd.disk.tran@P;

/*
 * --------------------------------------
 * The remainder of this file contains rules associated with the operation of
 * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
 *
 * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
 * generated by the disk-transport fmd module, and the resulting faults.
 */

/*
 * Fault events.
 * All three faults carry the same properties: FITrate=10, with the disk
 * itself (P) named as both FRU and ASRU.
 */
event fault.io.disk.over-temperature@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.predictive-failure@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.self-test-failure@P,
    FITrate=10, FRU=P, ASRU=P;

/*
 * ereports.
 */
event ereport.io.scsi.disk.over-temperature@P;
event ereport.io.scsi.disk.predictive-failure@P;
event ereport.io.scsi.disk.self-test-failure@P;

/*
 * Propagations.
 */
prop fault.io.disk.over-temperature@P ->
    ereport.io.scsi.disk.over-temperature@P;

prop fault.io.disk.self-test-failure@P ->
    ereport.io.scsi.disk.self-test-failure@P;

/*
 * Copy the sense code and qualifier from the ereport payload into the
 * fault payload under the shorter names "asc" and "ascq".
 */
prop fault.io.disk.predictive-failure@P ->
    ereport.io.scsi.disk.predictive-failure@P {
	setpayloadprop("asc", payloadprop("additional-sense-code")) &&
	setpayloadprop("ascq",
	    payloadprop("additional-sense-code-qualifier")) };