/* xref: /onnv-gate/usr/src/cmd/fm/eversholt/files/common/disk.esc (revision 12618:0e5eaf4bf546) */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Message dictionary named for the rules in this file.
 */
#pragma dictionary "DISK"

/*
 * P is shorthand for the "disk" node; every declaration and propagation
 * in this file is written against it.
 */
#define	P			disk

/*
 * The disk node is declared as both the FRU and the ASRU.
 */
fru P;
asru P;

/*
 * Overall comments for this file:
 * <disk-as-detector> The disk-as-detector DE provides the mapping between
 * ereports generated by a kernel disk driver sd(7D) and resulting faults.
 */

/*
 * SERD engine for media error fault propagation:
 *
 * This strategy is designed to give a file system, like ZFS, the
 * ability to attempt data recovery/relocation without faulting a disk.
 * This implementation depends on a file system retry to the same lba
 * to trigger a fault when recovery/relocation is not possible.
 *
 * We let the engine propagate one error only once every 1 minute, and then
 * if we still get 2 or more errors within 24 hours for the same LBA, there
 * is a fault.
 *
 * The per-LBA behaviour comes from the setserdsuffix(payloadprop("lba"))
 * constraint on the merr fault propagation later in this file.
 */
engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;

/*
 * disk-as-detector: fault events.
 *
 * The merr fault names the media-error SERD engine through its engine=
 * property; the derr fault has no engine attached.
 */
event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
    engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;

/*
 * The uderr fault will be defined at some future time.
 * event fault.io.scsi.cmd.disk.dev.uderr@P;
 */

/*
 * disk-as-detector: upset events.
 * NOTE: For now we define an upset to implement discard.
 */
event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
event upset.io.scsi.cmd.disk.dev.uderr@P;
event upset.io.scsi.cmd.disk.dev.serr@P;
event upset.io.scsi.cmd.disk.tran@P;
event upset.io.scsi.cmd.disk.recovered@P;

/*
 * disk-as-detector: ereports from the kernel.
 *
 * We don't know the topology for all scsi disks, but the kernel will always
 * generate ereport telemetry assuming that we do. We define these ereports
 * with 'discard_if_config_unknown=1', which permits ereports against things
 * with unknown topology to be silently discarded.  The ereport data is logged
 * in either case, and can be viewed via 'fmdump -eV'.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;

/*
 * For some ereports we let the 'driver-assessment', communicated as part of
 * the ereport payload, determine fault vs. upset via propagation constraints.
 *
 * DRIVER_ASSESSMENT_FATAL tests whether the "driver-assessment" payload
 * member contains "fatal"; DRIVER_ASSESSMENT_NONFATAL is its negation.
 */
#define DRIVER_ASSESSMENT_FATAL		\
	    (payloadprop_contains("driver-assessment", "fatal"))
#define DRIVER_ASSESSMENT_NONFATAL	(!DRIVER_ASSESSMENT_FATAL)

/*
 * disk-as-detector: propagations from faults (based on
 * DRIVER_ASSESSMENT_FATAL).
 * We need to set additional fault payloads to indicate fault details.
 * The payload members we may need are listed below:
 * fault.io.scsi.cmd.disk.dev.rqs.derr
 *     op_code, key, asc, ascq
 * fault.io.scsi.cmd.disk.dev.rqs.merr
 *     op_code, key, asc, ascq, lba
 *
 * NOTE: op_code is listed above but is not currently copied by either
 * fault propagation; only key, asc and ascq (plus lba for merr) are set.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&
    setpayloadprop("key", payloadprop("key")) &&
    setpayloadprop("asc", payloadprop("asc")) &&
    setpayloadprop("ascq", payloadprop("ascq"))};

/*
 * Use setserdsuffix with the specific LBA, so that the serd engine only
 * triggers if the fault recurs on the same LBA.
 *
 * The key, asc, ascq and lba members of the ereport payload are copied
 * into the fault payload.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&
    setserdsuffix(payloadprop("lba")) &&
    setpayloadprop("key", payloadprop("key")) &&
    setpayloadprop("asc", payloadprop("asc")) &&
    setpayloadprop("ascq", payloadprop("ascq")) &&
    setpayloadprop("lba", payloadprop("lba"))};

/*
 * NOTE: this propagation uses the "may" propagation of eversholt.
 * The ereport need never exist. It's just a way of making
 * the diagnosis wait for the within time on that ereport
 * to complete. Once it has completed, the diagnosis continues
 * even though the dummy ereport didn't occur.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;

/*
 * The uderr fault will be propagated at some future time.
 * prop fault.io.scsi.cmd.disk.dev.uderr@P->
 *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
 */

/*
 * disk-as-detector: propagations from upsets (based on
 * DRIVER_ASSESSMENT_NONFATAL).
 *
 * These are the counterparts of the derr/merr fault propagations: the same
 * ereports are attributed to an upset when the driver assessment is not
 * "fatal".
 */
prop upset.io.scsi.cmd.disk.dev.rqs.derr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_NONFATAL };

prop upset.io.scsi.cmd.disk.dev.rqs.merr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_NONFATAL };

/*
 * disk-as-detector: propagations from upsets (independent of
 * driver-assessment).
 *
 * Each of these ereports maps to its upset with no constraint.
 */

prop upset.io.scsi.cmd.disk.dev.serr@P->
    ereport.io.scsi.cmd.disk.dev.serr@P;

prop upset.io.scsi.cmd.disk.dev.uderr@P->
    ereport.io.scsi.cmd.disk.dev.uderr@P;

prop upset.io.scsi.cmd.disk.recovered@P->
    ereport.io.scsi.cmd.disk.recovered@P;

prop upset.io.scsi.cmd.disk.tran@P->
    ereport.io.scsi.cmd.disk.tran@P;

/*
 * --------------------------------------
 * The remainder of this file contains rules associated with the operation of
 * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
 *
 * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
 * generated by the disk-transport fmd module, and the resulting faults.
 */

/*
 * Fault events.
 *
 * All three faults carry the same properties: FITrate=10, with the disk
 * node as both FRU and ASRU.  FITrate is specified exactly once per event.
 */
event fault.io.disk.over-temperature@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.predictive-failure@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.self-test-failure@P,
    FITrate=10, FRU=P, ASRU=P;

/*
 * ereports.
 *
 * One ereport per fault.io.disk.* fault declared in this file.
 */
event ereport.io.scsi.disk.over-temperature@P;
event ereport.io.scsi.disk.predictive-failure@P;
event ereport.io.scsi.disk.self-test-failure@P;

/*
 * Propagations.
 *
 * Each fault maps 1-to-1 to the ereport of the same name.  For
 * predictive-failure, the "additional-sense-code" and
 * "additional-sense-code-qualifier" ereport payload members are copied
 * into the fault payload as "asc" and "ascq".
 */
prop fault.io.disk.over-temperature@P ->
    ereport.io.scsi.disk.over-temperature@P;

prop fault.io.disk.self-test-failure@P ->
    ereport.io.scsi.disk.self-test-failure@P;

prop fault.io.disk.predictive-failure@P ->
    ereport.io.scsi.disk.predictive-failure@P {
    setpayloadprop("asc", payloadprop("additional-sense-code")) &&
    setpayloadprop("ascq", payloadprop("additional-sense-code-qualifier")) };
